Merge remote-tracking branch 'origin/x86/mm' into x86/mm2
author	H. Peter Anvin <hpa@linux.intel.com>
Fri, 1 Feb 2013 10:25:06 +0000 (02:25 -0800)
committer	H. Peter Anvin <hpa@linux.intel.com>
Fri, 1 Feb 2013 10:28:36 +0000 (02:28 -0800)
Explicitly merging these two branches due to nontrivial conflicts and
to allow further work.

Resolved Conflicts:
arch/x86/kernel/head32.c
arch/x86/kernel/head64.c
arch/x86/mm/init_64.c
arch/x86/realmode/init.c

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
61 files changed:
Documentation/kernel-parameters.txt
Documentation/x86/boot.txt
Documentation/x86/zero-page.txt
arch/mips/cavium-octeon/dma-octeon.c
arch/sparc/mm/init_64.c
arch/x86/boot/boot.h
arch/x86/boot/cmdline.c
arch/x86/boot/compressed/cmdline.c
arch/x86/boot/compressed/head_64.S
arch/x86/boot/compressed/misc.c
arch/x86/boot/compressed/misc.h
arch/x86/boot/header.S
arch/x86/boot/setup.ld
arch/x86/include/asm/bootparam_utils.h [new file with mode: 0644]
arch/x86/include/asm/init.h
arch/x86/include/asm/kexec.h
arch/x86/include/asm/numa.h
arch/x86/include/asm/numa_64.h [deleted file]
arch/x86/include/asm/page.h
arch/x86/include/asm/page_types.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_64_types.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/realmode.h
arch/x86/include/asm/x86_init.h
arch/x86/include/uapi/asm/bootparam.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/amd_gart_64.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/e820.c
arch/x86/kernel/head32.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_64.S
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/setup.c
arch/x86/kernel/traps.c
arch/x86/kernel/x86_init.c
arch/x86/mm/init.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/mm_internal.h [new file with mode: 0644]
arch/x86/mm/numa_64.c
arch/x86/mm/pageattr.c
arch/x86/platform/efi/efi.c
arch/x86/power/hibernate_64.c
arch/x86/realmode/init.c
arch/x86/tools/relocs.c
arch/x86/xen/mmu.c
drivers/xen/swiotlb-xen.c
include/linux/bootmem.h
include/linux/kexec.h
include/linux/memblock.h
include/linux/mm.h
include/linux/swiotlb.h
kernel/kexec.c
lib/swiotlb.c
mm/bootmem.c
mm/memblock.c
mm/nobootmem.c

index 363e348bff9b93598587710de952a22c89f12bbe..da0e0773ca96d8107b3b72bfbf59a390085beee0 100644 (file)
@@ -594,6 +594,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        is selected automatically. Check
                        Documentation/kdump/kdump.txt for further details.
 
+       crashkernel_low=size[KMG]
+                       [KNL, x86] Amount of memory to reserve below 4G for the crash kernel.
+
        crashkernel=range1:size1[,range2:size2,...][@offset]
                        [KNL] Same as above, but depends on the memory
                        in the running system. The syntax of range is
index 406d82d5d2bb1e08a8cb9e4c548ce937f448b376..0e383169839a94ddbc7e4b413b1ef5ddacec83ec 100644 (file)
@@ -57,6 +57,10 @@ Protocol 2.10:       (Kernel 2.6.31) Added a protocol for relaxed alignment
 Protocol 2.11: (Kernel 3.6) Added a field for offset of EFI handover
                protocol entry point.
 
+Protocol 2.12: (Kernel 3.9) Added the xloadflags field and extension fields
+               to struct boot_params for loading bzImage and ramdisk
+               above 4G with a 64-bit kernel.
+
 **** MEMORY LAYOUT
 
 The traditional memory map for the kernel loader, used for Image or
@@ -182,7 +186,7 @@ Offset      Proto   Name            Meaning
 0230/4 2.05+   kernel_alignment Physical addr alignment required for kernel
 0234/1 2.05+   relocatable_kernel Whether kernel is relocatable or not
 0235/1 2.10+   min_alignment   Minimum alignment, as a power of two
-0236/2 N/A     pad3            Unused
+0236/2 2.12+   xloadflags      Boot protocol option flags
 0238/4 2.06+   cmdline_size    Maximum size of the kernel command line
 023C/4 2.07+   hardware_subarch Hardware subarchitecture
 0240/8 2.07+   hardware_subarch_data Subarchitecture-specific data
@@ -582,6 +586,27 @@ Protocol:  2.10+
   misaligned kernel.  Therefore, a loader should typically try each
   power-of-two alignment from kernel_alignment down to this alignment.
 
+Field name:     xloadflags
+Type:           read
+Offset/size:    0x236/2
+Protocol:       2.12+
+
+  This field is a bitmask.
+
+  Bit 0 (read):        XLF_KERNEL_64
+       - If 1, this kernel has the legacy 64-bit entry point at 0x200.
+
+  Bit 1 (read): XLF_CAN_BE_LOADED_ABOVE_4G
+        - If 1, kernel/boot_params/cmdline/ramdisk can be above 4G.
+
+  Bit 2 (read):        XLF_EFI_HANDOVER_32
+       - If 1, the kernel supports the 32-bit EFI handoff entry point
+          given at handover_offset.
+
+  Bit 3 (read): XLF_EFI_HANDOVER_64
+       - If 1, the kernel supports the 64-bit EFI handoff entry point
+          given at handover_offset + 0x200.
+
 Field name:    cmdline_size
 Type:          read
 Offset/size:   0x238/4
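A boot loader can test these bits before deciding how to place the image. A
minimal C sketch (hedged: the image buffer and helper name are illustrative;
only the offsets and bit values come from the tables above). The loader should
first check that the header version at offset 0x206 is at least 0x020c, since
older kernels leave these two bytes as padding:

	#include <stdint.h>

	#define XLF_KERNEL_64			(1 << 0)
	#define XLF_CAN_BE_LOADED_ABOVE_4G	(1 << 1)
	#define XLF_EFI_HANDOVER_32		(1 << 2)
	#define XLF_EFI_HANDOVER_64		(1 << 3)

	/* 'image' points at the start of the bzImage file in memory */
	static int can_load_above_4g(const uint8_t *image)
	{
		uint16_t version    = image[0x206] | (uint16_t)image[0x207] << 8;
		uint16_t xloadflags = image[0x236] | (uint16_t)image[0x237] << 8;

		if (version < 0x020c)	/* xloadflags only valid from 2.12 on */
			return 0;

		return (xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G) != 0;
	}
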
@@ -1029,6 +1054,44 @@ must have read/write permission; CS must be __BOOT_CS and DS, ES, SS
 must be __BOOT_DS; interrupt must be disabled; %esi must hold the base
 address of the struct boot_params; %ebp, %edi and %ebx must be zero.
 
+**** 64-bit BOOT PROTOCOL
+
+On machines with a 64-bit CPU and a 64-bit kernel, a 64-bit bootloader can
+be used, and for that a 64-bit boot protocol is needed.
+
+In the 64-bit boot protocol, the first step in loading a Linux kernel
+should be to set up the boot parameters (struct boot_params,
+traditionally known as the "zero page"). The memory for struct boot_params
+can be allocated anywhere (even above 4G) and initialized to all zero.
+Then, the setup header at offset 0x01f1 of the kernel image should be
+loaded into struct boot_params and examined. The end of setup header
+can be calculated as follows:
+
+       0x0202 + byte value at offset 0x0201
+
+In addition to reading/modifying/writing the setup header of the struct
+boot_params as in the 16-bit boot protocol, the boot loader should
+also fill the additional fields of the struct boot_params as described
+in zero-page.txt.
+
+After setting up the struct boot_params, the boot loader can load the
+64-bit kernel in the same way as in the 16-bit boot protocol, except
+that the kernel may be loaded above 4G.
+
+In the 64-bit boot protocol, the kernel is started by jumping to the
+64-bit kernel entry point, which is the start address of the loaded
+64-bit kernel plus 0x200.
+
+At entry, the CPU must be in 64-bit mode with paging enabled.
+The range of setup_header.init_size bytes starting at the load address
+of the kernel, as well as the zero page and the command line buffer,
+must be identity mapped; a GDT must be loaded with the descriptors for
+selectors __BOOT_CS(0x10) and __BOOT_DS(0x18); both descriptors must be
+4G flat segments; __BOOT_CS must have execute/read permission, and
+__BOOT_DS must have read/write permission; CS must be __BOOT_CS and
+DS, ES, SS must be __BOOT_DS; interrupts must be disabled; %rsi must
+hold the base address of the struct boot_params.
+
 **** EFI HANDOVER PROTOCOL
 
 This protocol allows boot loaders to defer initialisation to the EFI
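To tie the 64-bit boot protocol steps above together, here is a hedged
loader-side sketch in C (the names image, cmdline_addr, initrd_addr and
initrd_size are hypothetical; struct boot_params and its ext_* fields are the
ones described in zero-page.txt and <asm/bootparam.h>):

	#include <stdint.h>
	#include <string.h>
	#include <asm/bootparam.h>	/* struct boot_params */

	static void setup_zero_page(struct boot_params *bp, const uint8_t *image,
				    uint64_t cmdline_addr,
				    uint64_t initrd_addr, uint64_t initrd_size)
	{
		memset(bp, 0, sizeof(*bp));	/* the "zero page" really is zero */

		/* Copy the setup header from offset 0x1f1 of the image; its
		 * length is (0x202 + byte at offset 0x201) - 0x1f1. */
		memcpy(&bp->hdr, image + 0x1f1, 0x202 + image[0x201] - 0x1f1);

		/* Legacy 32-bit fields carry the low halves; the ext_* fields
		 * carry the high halves. */
		bp->hdr.cmd_line_ptr  = (uint32_t)cmdline_addr;
		bp->ext_cmd_line_ptr  = (uint32_t)(cmdline_addr >> 32);
		bp->hdr.ramdisk_image = (uint32_t)initrd_addr;
		bp->ext_ramdisk_image = (uint32_t)(initrd_addr >> 32);
		bp->hdr.ramdisk_size  = (uint32_t)initrd_size;
		bp->ext_ramdisk_size  = (uint32_t)(initrd_size >> 32);
	}

The loader then loads the kernel (possibly above 4G), identity maps the range
[load_addr, load_addr + hdr.init_size) plus the zero page and the command line
buffer, and jumps to load_addr + 0x200 in 64-bit mode with %rsi holding the
physical address of the struct boot_params.
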
index cf5437deda81a003864f9e8a5f4e533d276a189d..199f453cb4de10016030c2dd230fb9a3a3125cee 100644 (file)
@@ -19,6 +19,9 @@ Offset        Proto   Name            Meaning
 090/010        ALL     hd1_info        hd1 disk parameter, OBSOLETE!!
 0A0/010        ALL     sys_desc_table  System description table (struct sys_desc_table)
 0B0/010        ALL     olpc_ofw_header OLPC's OpenFirmware CIF and friends
+0C0/004        ALL     ext_ramdisk_image ramdisk_image high 32bits
+0C4/004        ALL     ext_ramdisk_size  ramdisk_size high 32bits
+0C8/004        ALL     ext_cmd_line_ptr  cmd_line_ptr high 32bits
 140/080        ALL     edid_info       Video mode setup (struct edid_info)
 1C0/020        ALL     efi_info        EFI 32 information (struct efi_info)
 1E0/004        ALL     alk_mem_k       Alternative mem check, in KB
@@ -27,6 +30,7 @@ Offset        Proto   Name            Meaning
 1E9/001        ALL     eddbuf_entries  Number of entries in eddbuf (below)
 1EA/001        ALL     edd_mbr_sig_buf_entries Number of entries in edd_mbr_sig_buffer
                                (below)
+1EF/001        ALL     sentinel        Used to detect broken bootloaders
 290/040        ALL     edd_mbr_sig_buffer EDD MBR signatures
 2D0/A00        ALL     e820_map        E820 memory map table
                                (array of struct e820entry)
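On the kernel side, each full 64-bit value is rebuilt by combining the legacy
32-bit field with its ext_* counterpart, the same pattern used by the new
get_cmd_line_ptr() helpers later in this diff; an illustrative sketch for the
ramdisk address (not the literal kernel code):

	static u64 get_ramdisk_image(const struct boot_params *bp)
	{
		u64 ramdisk_image = bp->hdr.ramdisk_image;

		ramdisk_image |= (u64)bp->ext_ramdisk_image << 32;

		return ramdisk_image;
	}
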
index 41dd008849757f2cbeec99f00cdf7193e80ac874..02f244475207f1ed6be645d644867595b56eacbf 100644 (file)
@@ -317,7 +317,8 @@ void __init plat_swiotlb_setup(void)
 
        octeon_swiotlb = alloc_bootmem_low_pages(swiotlbsize);
 
-       swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1);
+       if (swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1) == -ENOMEM)
+               panic("Cannot allocate SWIOTLB buffer");
 
        mips_dma_map_ops = &octeon_linear_dma_map_ops.dma_map_ops;
 }
index c3b72423c846465373e6c06ab961ac99efce4ea6..fc5a7c4bd9e8dab06e452589df3ab8b950d0ca66 100644 (file)
@@ -2021,6 +2021,16 @@ static void __init patch_tlb_miss_handler_bitmap(void)
        flushi(&valid_addr_bitmap_insn[0]);
 }
 
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+       int i;
+
+       for_each_online_node(i)
+               if (NODE_DATA(i)->node_spanned_pages)
+                       register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
 void __init mem_init(void)
 {
        unsigned long codepages, datapages, initpages;
@@ -2038,20 +2048,8 @@ void __init mem_init(void)
 
        high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-       {
-               int i;
-               for_each_online_node(i) {
-                       if (NODE_DATA(i)->node_spanned_pages != 0) {
-                               totalram_pages +=
-                                       free_all_bootmem_node(NODE_DATA(i));
-                       }
-               }
-               totalram_pages += free_low_memory_core_early(MAX_NUMNODES);
-       }
-#else
+       register_page_bootmem_info();
        totalram_pages = free_all_bootmem();
-#endif
 
        /* We subtract one to account for the mem_map_zero page
         * allocated below.
index 18997e5a1053128369bb22029d762f9dde204874..5b7531966b8466e00ba6d909032ad9a9407e318a 100644 (file)
@@ -285,16 +285,26 @@ struct biosregs {
 void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
 
 /* cmdline.c */
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize);
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize);
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option);
 static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
 {
-       return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize);
+       unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+       if (cmd_line_ptr >= 0x100000)
+               return -1;      /* inaccessible */
+
+       return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize);
 }
 
 static inline int cmdline_find_option_bool(const char *option)
 {
-       return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
+       unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+       if (cmd_line_ptr >= 0x100000)
+               return -1;      /* inaccessible */
+
+       return __cmdline_find_option_bool(cmd_line_ptr, option);
 }
 
 
index 6b3b6f708c04262c02a788f815d1feac7d25786b..625d21b0cd3fd581449a6f23cc1ca59777ff82b3 100644 (file)
@@ -27,7 +27,7 @@ static inline int myisspace(u8 c)
  * Returns the length of the argument (regardless of if it was
  * truncated to fit in the buffer), or -1 on not found.
  */
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize)
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize)
 {
        addr_t cptr;
        char c;
@@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
                st_bufcpy       /* Copying this to buffer */
        } state = st_wordstart;
 
-       if (!cmdline_ptr || cmdline_ptr >= 0x100000)
-               return -1;      /* No command line, or inaccessible */
+       if (!cmdline_ptr)
+               return -1;      /* No command line */
 
        cptr = cmdline_ptr & 0xf;
        set_fs(cmdline_ptr >> 4);
@@ -99,7 +99,7 @@ int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int
  * Returns the position of that option (starts counting with 1)
  * or 0 on not found
  */
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option)
 {
        addr_t cptr;
        char c;
@@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
                st_wordskip,    /* Miscompare, skip */
        } state = st_wordstart;
 
-       if (!cmdline_ptr || cmdline_ptr >= 0x100000)
-               return -1;      /* No command line, or inaccessible */
+       if (!cmdline_ptr)
+               return -1;      /* No command line */
 
        cptr = cmdline_ptr & 0xf;
        set_fs(cmdline_ptr >> 4);
index 10f6b1178c683cb1901a0704b25e653f949e0c31..bffd73b45b1f27a4dc6cdd2ab6ee5451cd1bbd0d 100644 (file)
@@ -13,13 +13,21 @@ static inline char rdfs8(addr_t addr)
        return *((char *)(fs + addr));
 }
 #include "../cmdline.c"
+static unsigned long get_cmd_line_ptr(void)
+{
+       unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
+
+       cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32;
+
+       return cmd_line_ptr;
+}
 int cmdline_find_option(const char *option, char *buffer, int bufsize)
 {
-       return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize);
+       return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize);
 }
 int cmdline_find_option_bool(const char *option)
 {
-       return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
+       return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
 }
 
 #endif
index 2c4b171eec337619e8f2dab2b3c3b2048e622e51..d9ae9a4ffcb981ea165ccec893c8e3d28bbe1ff3 100644 (file)
        __HEAD
        .code32
 ENTRY(startup_32)
+       /*
+        * 32bit entry is 0 and it is ABI so immutable!
+        * If we come here directly from a bootloader,
+        * kernel(text+data+bss+brk), ramdisk, zero_page, command line
+        * all need to be under the 4G limit.
+        */
        cld
        /*
         * Test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -154,6 +160,12 @@ ENTRY(startup_32)
        btsl    $_EFER_LME, %eax
        wrmsr
 
+       /* After gdt is loaded */
+       xorl    %eax, %eax
+       lldt    %ax
+       movl    $0x20, %eax
+       ltr     %ax
+
        /*
         * Setup for the jump to 64bit mode
         *
@@ -176,28 +188,18 @@ ENTRY(startup_32)
        lret
 ENDPROC(startup_32)
 
-no_longmode:
-       /* This isn't an x86-64 CPU so hang */
-1:
-       hlt
-       jmp     1b
-
-#include "../../kernel/verify_cpu.S"
-
-       /*
-        * Be careful here startup_64 needs to be at a predictable
-        * address so I can export it in an ELF header.  Bootloaders
-        * should look at the ELF header to find this address, as
-        * it may change in the future.
-        */
        .code64
        .org 0x200
 ENTRY(startup_64)
        /*
+        * 64bit entry is 0x200 and it is ABI so immutable!
         * We come here either from startup_32 or directly from a
-        * 64bit bootloader.  If we come here from a bootloader we depend on
-        * an identity mapped page table being provied that maps our
-        * entire text+data+bss and hopefully all of memory.
+        * 64bit bootloader.
+        * If we come here from a bootloader, kernel(text+data+bss+brk),
+        * ramdisk, zero_page, command line could be above 4G.
+        * We depend on an identity mapped page table being provided
+        * that maps our entire kernel(text+data+bss+brk), zero page
+        * and command line.
         */
 #ifdef CONFIG_EFI_STUB
        /*
@@ -247,9 +249,6 @@ preferred_addr:
        movl    %eax, %ss
        movl    %eax, %fs
        movl    %eax, %gs
-       lldt    %ax
-       movl    $0x20, %eax
-       ltr     %ax
 
        /*
         * Compute the decompressed kernel start address.  It is where
@@ -349,6 +348,15 @@ relocated:
  */
        jmp     *%rbp
 
+       .code32
+no_longmode:
+       /* This isn't an x86-64 CPU so hang */
+1:
+       hlt
+       jmp     1b
+
+#include "../../kernel/verify_cpu.S"
+
        .data
 gdt:
        .word   gdt_end - gdt
index 88f7ff6da40442086b6c6664b227f00f97bf2711..7cb56c6ca35154f19d3a1f4242845f8242a37202 100644 (file)
@@ -325,6 +325,8 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 {
        real_mode = rmode;
 
+       sanitize_boot_params(real_mode);
+
        if (real_mode->screen_info.orig_video_mode == 7) {
                vidmem = (char *) 0xb0000;
                vidport = 0x3b4;
index 0e6dc0ee0eeabd04fc283bce937b734ea42923c7..674019d8e2355901b69de5c55cfb92669c5628be 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/page.h>
 #include <asm/boot.h>
 #include <asm/bootparam.h>
+#include <asm/bootparam_utils.h>
 
 #define BOOT_BOOT_H
 #include "../ctype.h"
index 8c132a625b94991c179def21973bb8ad3c3a56d5..9ec06a1f6d61b2d64f6520bb68d1b0e047af1276 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/e820.h>
 #include <asm/page_types.h>
 #include <asm/setup.h>
+#include <asm/bootparam.h>
 #include "boot.h"
 #include "voffset.h"
 #include "zoffset.h"
@@ -255,6 +256,9 @@ section_table:
        # header, from the old boot sector.
 
        .section ".header", "a"
+       .globl  sentinel
+sentinel:      .byte 0xff, 0xff        /* Used to detect broken loaders */
+
        .globl  hdr
 hdr:
 setup_sects:   .byte 0                 /* Filled in by build.c */
@@ -279,7 +283,7 @@ _start:
        # Part 2 of the header, from the old setup.S
 
                .ascii  "HdrS"          # header signature
-               .word   0x020b          # header version number (>= 0x0105)
+               .word   0x020c          # header version number (>= 0x0105)
                                        # or else old loadlin-1.5 will fail)
                .globl realmode_swtch
 realmode_swtch:        .word   0, 0            # default_switch, SETUPSEG
@@ -297,13 +301,7 @@ type_of_loader:    .byte   0               # 0 means ancient bootloader, newer
 
 # flags, unused bits must be zero (RFU) bit within loadflags
 loadflags:
-LOADED_HIGH    = 1                     # If set, the kernel is loaded high
-CAN_USE_HEAP   = 0x80                  # If set, the loader also has set
-                                       # heap_end_ptr to tell how much
-                                       # space behind setup.S can be used for
-                                       # heap purposes.
-                                       # Only the loader knows what is free
-               .byte   LOADED_HIGH
+               .byte   LOADED_HIGH     # The kernel is to be loaded high
 
 setup_move_size: .word  0x8000         # size to move, when setup is not
                                        # loaded at 0x90000. We will move setup
@@ -369,7 +367,31 @@ relocatable_kernel:    .byte 1
 relocatable_kernel:    .byte 0
 #endif
 min_alignment:         .byte MIN_KERNEL_ALIGN_LG2      # minimum alignment
-pad3:                  .word 0
+
+xloadflags:
+#ifdef CONFIG_X86_64
+# define XLF0 XLF_KERNEL_64                    /* 64-bit kernel */
+#else
+# define XLF0 0
+#endif
+
+#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64)
+   /* kernel/boot_params/ramdisk could be loaded above 4G */
+# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
+#else
+# define XLF1 0
+#endif
+
+#ifdef CONFIG_EFI_STUB
+# ifdef CONFIG_X86_64
+#  define XLF23 XLF_EFI_HANDOVER_64            /* 64-bit EFI handover ok */
+# else
+#  define XLF23 XLF_EFI_HANDOVER_32            /* 32-bit EFI handover ok */
+# endif
+#else
+# define XLF23 0
+#endif
+                       .word XLF0 | XLF1 | XLF23
 
 cmdline_size:   .long   COMMAND_LINE_SIZE-1     #length of the command line,
                                                 #added with boot protocol
@@ -397,8 +419,13 @@ pref_address:              .quad LOAD_PHYSICAL_ADDR        # preferred load addr
 #define INIT_SIZE VO_INIT_SIZE
 #endif
 init_size:             .long INIT_SIZE         # kernel initialization size
-handover_offset:       .long 0x30              # offset to the handover
+handover_offset:
+#ifdef CONFIG_EFI_STUB
+                       .long 0x30              # offset to the handover
                                                # protocol entry point
+#else
+                       .long 0
+#endif
 
 # End of setup header #####################################################
 
index 03c0683636b6fbf859a8795262f249631e6157f9..96a6c7563538364d2dee7e307846815156c11c33 100644 (file)
@@ -13,7 +13,7 @@ SECTIONS
        .bstext         : { *(.bstext) }
        .bsdata         : { *(.bsdata) }
 
-       . = 497;
+       . = 495;
        .header         : { *(.header) }
        .entrytext      : { *(.entrytext) }
        .inittext       : { *(.inittext) }
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
new file mode 100644 (file)
index 0000000..5b5e9cb
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef _ASM_X86_BOOTPARAM_UTILS_H
+#define _ASM_X86_BOOTPARAM_UTILS_H
+
+#include <asm/bootparam.h>
+
+/*
+ * This file is included from multiple environments.  Do not
+ * add completing #includes to make it standalone.
+ */
+
+/*
+ * Deal with bootloaders which fail to initialize unknown fields in
+ * boot_params to zero.  The fields in this list are taken from
+ * analysis of kexec-tools; if other broken bootloaders initialize a
+ * different set of fields we will need to figure out how to disambiguate.
+ *
+ */
+static void sanitize_boot_params(struct boot_params *boot_params)
+{
+       if (boot_params->sentinel) {
+               /* fields in boot_params are not valid, clear them */
+               memset(&boot_params->olpc_ofw_header, 0,
+                      (char *)&boot_params->alt_mem_k -
+                       (char *)&boot_params->olpc_ofw_header);
+               memset(&boot_params->kbd_status, 0,
+                      (char *)&boot_params->hdr -
+                      (char *)&boot_params->kbd_status);
+               memset(&boot_params->_pad7[0], 0,
+                      (char *)&boot_params->edd_mbr_sig_buffer[0] -
+                       (char *)&boot_params->_pad7[0]);
+               memset(&boot_params->_pad8[0], 0,
+                      (char *)&boot_params->eddbuf[0] -
+                       (char *)&boot_params->_pad8[0]);
+               memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9));
+       }
+}
+
+#endif /* _ASM_X86_BOOTPARAM_UTILS_H */
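The sentinel mechanism this helper relies on, sketched from the loader's side
(hedged; the buffer handling is illustrative): a loader that starts from an
all-zero boot_params buffer and copies only the setup header never sets the
sentinel, while a loader that is too generous with the copy picks up the 0xff
byte placed at offset 0x1ef by header.S and thereby marks its own boot_params
as suspect:

	/* Well-behaved loader: bp->sentinel stays 0, nothing is sanitized. */
	memset(bp, 0, sizeof(*bp));
	memcpy(&bp->hdr, image + 0x1f1, 0x202 + image[0x201] - 0x1f1);

	/* Careless loader: copying a whole page of the image also copies the
	 * 0xff sentinel at 0x1ef, so sanitize_boot_params() will zero the
	 * fields such a loader cannot have filled in deliberately. */
	memcpy(bp, image, 4096);
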
index adcc0ae73d0914b4fe7b816c4512d296b290ba70..223042086f4e9aa29498d3f159aaa2b757fa766a 100644 (file)
@@ -1,20 +1,14 @@
-#ifndef _ASM_X86_INIT_32_H
-#define _ASM_X86_INIT_32_H
+#ifndef _ASM_X86_INIT_H
+#define _ASM_X86_INIT_H
 
-#ifdef CONFIG_X86_32
-extern void __init early_ioremap_page_table_range_init(void);
-#endif
+struct x86_mapping_info {
+       void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
+       void *context;                   /* context for alloc_pgt_page */
+       unsigned long pmd_flag;          /* page flag for PMD entry */
+       bool kernel_mapping;             /* kernel mapping or ident mapping */
+};
 
-extern void __init zone_sizes_init(void);
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+                               unsigned long addr, unsigned long end);
 
-extern unsigned long __init
-kernel_physical_mapping_init(unsigned long start,
-                            unsigned long end,
-                            unsigned long page_size_mask);
-
-
-extern unsigned long __initdata pgt_buf_start;
-extern unsigned long __meminitdata pgt_buf_end;
-extern unsigned long __meminitdata pgt_buf_top;
-
-#endif /* _ASM_X86_INIT_32_H */
+#endif /* _ASM_X86_INIT_H */
index 6080d2694bad07fcf3507d7a37231f00a333668e..17483a492f1882f218849bedd1c26d8a970d433f 100644 (file)
 # define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
 #else
 /* Maximum physical address we can use pages from */
-# define KEXEC_SOURCE_MEMORY_LIMIT      (0xFFFFFFFFFFUL)
+# define KEXEC_SOURCE_MEMORY_LIMIT      (MAXMEM-1)
 /* Maximum address we can reach in physical address mode */
-# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1)
 /* Maximum address we can use for the control pages */
-# define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+# define KEXEC_CONTROL_MEMORY_LIMIT     (MAXMEM-1)
 
 /* Allocate one page for the pdp and the second for the code */
 # define KEXEC_CONTROL_PAGE_SIZE  (4096UL + 4096UL)
index 49119fcea2dc9771505b04928dbbb5800e030eaf..52560a2038e103fd359a4bcbc4e188052de6f2cf 100644 (file)
@@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu)
 
 #ifdef CONFIG_X86_32
 # include <asm/numa_32.h>
-#else
-# include <asm/numa_64.h>
 #endif
 
 #ifdef CONFIG_NUMA
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
deleted file mode 100644 (file)
index 0c05f7a..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_X86_NUMA_64_H
-#define _ASM_X86_NUMA_64_H
-
-extern unsigned long numa_free_all_bootmem(void);
-
-#endif /* _ASM_X86_NUMA_64_H */
index 3698a6a0a940cdcf607cab91c0c67b7231922213..c87892442e53d4081f4ff1276524e0298cf73617 100644 (file)
 
 struct page;
 
+#include <linux/range.h>
+extern struct range pfn_mapped[];
+extern int nr_pfn_mapped;
+
 static inline void clear_user_page(void *page, unsigned long vaddr,
                                   struct page *pg)
 {
index e21fdd10479f88e339e1e7351da837c563361635..54c97879195e7c5137d2a45fbda51c40a1c41be2 100644 (file)
@@ -51,6 +51,8 @@ static inline phys_addr_t get_max_mapped(void)
        return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
 }
 
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
+
 extern unsigned long init_memory_mapping(unsigned long start,
                                         unsigned long end);
 
index bc28e6fe705287e29d55da8bc78a07726c8b0150..b6e41b8cd659fa791f8458eead7d622c721c5e52 100644 (file)
@@ -616,6 +616,8 @@ static inline int pgd_none(pgd_t pgd)
 #ifndef __ASSEMBLY__
 
 extern int direct_gbpages;
+void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
index 766ea16fbbbda730a9952a67d85c8cda954f62ef..2d883440cb9a282f948d7062684c52789e95d90f 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_PGTABLE_64_DEFS_H
 #define _ASM_X86_PGTABLE_64_DEFS_H
 
+#include <asm/sparsemem.h>
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
+#define EARLY_DYNAMIC_PAGE_TABLES      64
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
index 9f82690f81ed71a8fbc5b7c25c1cc4107206dfa8..e6423002c10b5211af8fd45cb1cefa120d45c534 100644 (file)
@@ -321,7 +321,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
-extern void native_pagetable_reserve(u64 start, u64 end);
 #ifdef CONFIG_X86_32
 extern void native_pagetable_init(void);
 #else
index 888184b2fc85c7987a63d365e9bfd1d334eb9372..bdee8bd318eaad5d5e74f72763f63aabb29717d2 100644 (file)
@@ -731,6 +731,7 @@ extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
 extern void early_trap_init(void);
+void early_trap_pf_init(void);
 
 /* Defined in head.S */
 extern struct desc_ptr         early_gdt_descr;
index fe1ec5bcd84644b4c4c0b68660be85dae0bd39ad..9c6b890d5e7a0733ed7e92f1d3f1b1aa81bd349f 100644 (file)
@@ -58,6 +58,7 @@ extern unsigned char boot_gdt[];
 extern unsigned char secondary_startup_64[];
 #endif
 
-extern void __init setup_real_mode(void);
+void reserve_real_mode(void);
+void setup_real_mode(void);
 
 #endif /* _ARCH_X86_REALMODE_H */
index 57693498519c4a69962417c8889b376182b49e2f..3b2ce8fc995ac73fa151d7e6abb3de79cd07f5b2 100644 (file)
@@ -68,17 +68,6 @@ struct x86_init_oem {
        void (*banner)(void);
 };
 
-/**
- * struct x86_init_mapping - platform specific initial kernel pagetable setup
- * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
- *
- * For more details on the purpose of this hook, look in
- * init_memory_mapping and the commit that added it.
- */
-struct x86_init_mapping {
-       void (*pagetable_reserve)(u64 start, u64 end);
-};
-
 /**
  * struct x86_init_paging - platform specific paging functions
  * @pagetable_init:    platform specific paging initialization call to setup
@@ -136,7 +125,6 @@ struct x86_init_ops {
        struct x86_init_mpparse         mpparse;
        struct x86_init_irqs            irqs;
        struct x86_init_oem             oem;
-       struct x86_init_mapping         mapping;
        struct x86_init_paging          paging;
        struct x86_init_timers          timers;
        struct x86_init_iommu           iommu;
index 92862cd902012b9fb174d03e6e2cd687787136cf..c15ddaf907107134d6cd2f8d86f554e510a6f848 100644 (file)
@@ -1,6 +1,31 @@
 #ifndef _ASM_X86_BOOTPARAM_H
 #define _ASM_X86_BOOTPARAM_H
 
+/* setup_data types */
+#define SETUP_NONE                     0
+#define SETUP_E820_EXT                 1
+#define SETUP_DTB                      2
+#define SETUP_PCI                      3
+
+/* ram_size flags */
+#define RAMDISK_IMAGE_START_MASK       0x07FF
+#define RAMDISK_PROMPT_FLAG            0x8000
+#define RAMDISK_LOAD_FLAG              0x4000
+
+/* loadflags */
+#define LOADED_HIGH    (1<<0)
+#define QUIET_FLAG     (1<<5)
+#define KEEP_SEGMENTS  (1<<6)
+#define CAN_USE_HEAP   (1<<7)
+
+/* xloadflags */
+#define XLF_KERNEL_64                  (1<<0)
+#define XLF_CAN_BE_LOADED_ABOVE_4G     (1<<1)
+#define XLF_EFI_HANDOVER_32            (1<<2)
+#define XLF_EFI_HANDOVER_64            (1<<3)
+
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <linux/screen_info.h>
 #include <linux/apm_bios.h>
@@ -9,12 +34,6 @@
 #include <asm/ist.h>
 #include <video/edid.h>
 
-/* setup data types */
-#define SETUP_NONE                     0
-#define SETUP_E820_EXT                 1
-#define SETUP_DTB                      2
-#define SETUP_PCI                      3
-
 /* extensible setup data list node */
 struct setup_data {
        __u64 next;
@@ -28,9 +47,6 @@ struct setup_header {
        __u16   root_flags;
        __u32   syssize;
        __u16   ram_size;
-#define RAMDISK_IMAGE_START_MASK       0x07FF
-#define RAMDISK_PROMPT_FLAG            0x8000
-#define RAMDISK_LOAD_FLAG              0x4000
        __u16   vid_mode;
        __u16   root_dev;
        __u16   boot_flag;
@@ -42,10 +58,6 @@ struct setup_header {
        __u16   kernel_version;
        __u8    type_of_loader;
        __u8    loadflags;
-#define LOADED_HIGH    (1<<0)
-#define QUIET_FLAG     (1<<5)
-#define KEEP_SEGMENTS  (1<<6)
-#define CAN_USE_HEAP   (1<<7)
        __u16   setup_move_size;
        __u32   code32_start;
        __u32   ramdisk_image;
@@ -58,7 +70,8 @@ struct setup_header {
        __u32   initrd_addr_max;
        __u32   kernel_alignment;
        __u8    relocatable_kernel;
-       __u8    _pad2[3];
+       __u8    min_alignment;
+       __u16   xloadflags;
        __u32   cmdline_size;
        __u32   hardware_subarch;
        __u64   hardware_subarch_data;
@@ -106,7 +119,10 @@ struct boot_params {
        __u8  hd1_info[16];     /* obsolete! */         /* 0x090 */
        struct sys_desc_table sys_desc_table;           /* 0x0a0 */
        struct olpc_ofw_header olpc_ofw_header;         /* 0x0b0 */
-       __u8  _pad4[128];                               /* 0x0c0 */
+       __u32 ext_ramdisk_image;                        /* 0x0c0 */
+       __u32 ext_ramdisk_size;                         /* 0x0c4 */
+       __u32 ext_cmd_line_ptr;                         /* 0x0c8 */
+       __u8  _pad4[116];                               /* 0x0cc */
        struct edid_info edid_info;                     /* 0x140 */
        struct efi_info efi_info;                       /* 0x1c0 */
        __u32 alt_mem_k;                                /* 0x1e0 */
@@ -115,7 +131,20 @@ struct boot_params {
        __u8  eddbuf_entries;                           /* 0x1e9 */
        __u8  edd_mbr_sig_buf_entries;                  /* 0x1ea */
        __u8  kbd_status;                               /* 0x1eb */
-       __u8  _pad6[5];                                 /* 0x1ec */
+       __u8  _pad5[3];                                 /* 0x1ec */
+       /*
+        * The sentinel is set to a nonzero value (0xff) in header.S.
+        *
+        * A bootloader is supposed to only take setup_header and put
+        * it into a clean boot_params buffer. If it turns out that
+        * it is clumsy or too generous with the buffer, it most
+        * probably will pick up the sentinel variable too. The fact
+        * that this variable then is still 0xff will let kernel
+        * know that some variables in boot_params are invalid and
+        * kernel should zero out certain portions of boot_params.
+        */
+       __u8  sentinel;                                 /* 0x1ef */
+       __u8  _pad6[1];                                 /* 0x1f0 */
        struct setup_header hdr;    /* setup header */  /* 0x1f1 */
        __u8  _pad7[0x290-0x1f1-sizeof(struct setup_header)];
        __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX];      /* 0x290 */
@@ -134,6 +163,6 @@ enum {
        X86_NR_SUBARCHS,
 };
 
-
+#endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_BOOTPARAM_H */
index bacf4b0d91f4e0e01c4b7bc7cbcfebe6e1e3a680..cfc755dc1607b0c8d51bef3b2b12a9ffa94b2c25 100644 (file)
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
 # include <asm/proto.h>
-# include <asm/numa_64.h>
 #endif                         /* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (                                       \
index e66311200cbd8ae78274e7f3525d9098cec43ffe..b574b295a2f9922c03673f8784cf72c0a743dbf5 100644 (file)
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
        aper_base       = info.aper_base;
        end_pfn         = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
 
-       if (end_pfn > max_low_pfn_mapped) {
-               start_pfn = (aper_base>>PAGE_SHIFT);
+       start_pfn = PFN_DOWN(aper_base);
+       if (!pfn_range_is_mapped(start_pfn, end_pfn))
                init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
-       }
 
        pr_info("PCI-DMA: using GART IOMMU.\n");
        iommu_size = check_iommu_size(info.aper_base, aper_size);
index 15239fffd6fee747913a1f0e493c512885239379..eafb084e80f87e5904fbdd3fff36aecec19d6326 100644 (file)
@@ -12,7 +12,6 @@
 #include <asm/pci-direct.h>
 
 #ifdef CONFIG_X86_64
-# include <asm/numa_64.h>
 # include <asm/mmconfig.h>
 # include <asm/cacheflush.h>
 #endif
@@ -685,12 +684,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                 * benefit in doing so.
                 */
                if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+                       unsigned long pfn = tseg >> PAGE_SHIFT;
+
                        printk(KERN_DEBUG "tseg: %010llx\n", tseg);
-                       if ((tseg>>PMD_SHIFT) <
-                               (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
-                               ((tseg>>PMD_SHIFT) <
-                               (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
-                               (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+                       if (pfn_range_is_mapped(pfn, pfn + 1))
                                set_memory_4k((unsigned long)__va(tseg), 1);
                }
        }
index fdfefa27b94832fb672651d922c185ed4fa218d3..1905ce98bee01d667a0b6604d3c18f4d7c34e0fb 100644 (file)
@@ -17,7 +17,6 @@
 
 #ifdef CONFIG_X86_64
 #include <linux/topology.h>
-#include <asm/numa_64.h>
 #endif
 
 #include "cpu.h"
index df06ade26bef8485af1a66d797370385d0012e52..d32abeabbda556ea0c9387a331f15a129bf4a0de 100644 (file)
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
 }
 early_param("mem", parse_memopt);
 
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
 {
        char *oldp;
        u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
 
        return *p == '\0' ? 0 : -EINVAL;
 }
+static int __init parse_memmap_opt(char *str)
+{
+       while (str) {
+               char *k = strchr(str, ',');
+
+               if (k)
+                       *k++ = 0;
+
+               parse_memmap_one(str);
+               str = k;
+       }
+
+       return 0;
+}
 early_param("memmap", parse_memmap_opt);
 
 void __init finish_e820_parsing(void)
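The parse_memmap_one()/parse_memmap_opt() split above lets a single memmap=
parameter carry several comma-separated regions instead of requiring one
memmap= option per region. An illustrative (made-up) command line, using the
existing per-entry syntax (@ usable RAM, # ACPI data, $ reserved):

	memmap=100M@2G,100M#3G,100M$4G
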
index e175548329915e23ddef8f6dae39fdfe68d73100..138463a24877df63e1fae62ca2870a4819481889 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/io_apic.h>
 #include <asm/bios_ebda.h>
 #include <asm/tlbflush.h>
+#include <asm/bootparam_utils.h>
 
 static void __init i386_default_early_setup(void)
 {
@@ -30,19 +31,7 @@ static void __init i386_default_early_setup(void)
 
 void __init i386_start_kernel(void)
 {
-       memblock_reserve(__pa_symbol(_text),
-                        (unsigned long)__bss_stop - (unsigned long)_text);
-
-#ifdef CONFIG_BLK_DEV_INITRD
-       /* Reserve INITRD */
-       if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-               /* Assume only end is not page aligned */
-               u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-               u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
-               u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-               memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-       }
-#endif
+       sanitize_boot_params(&boot_params);
 
        /* Call the subarch specific early setup function */
        switch (boot_params.hdr.hardware_subarch) {
@@ -57,11 +46,5 @@ void __init i386_start_kernel(void)
                break;
        }
 
-       /*
-        * At this point everything still needed from the boot loader
-        * or BIOS or kernel text should be early reserved or marked not
-        * RAM in e820. All other memory is free game.
-        */
-
        start_kernel();
 }
index 7b215a50ec1e21efb7c520325d0d039c1ba24844..57334f4cd3af1765a4dd3a63bcf141fe49a6f0f3 100644 (file)
 #include <asm/kdebug.h>
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
+#include <asm/bootparam_utils.h>
 
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
+{
+       unsigned long i;
+
+       for (i = 0; i < PTRS_PER_PGD-1; i++)
+               early_level4_pgt[i].pgd = 0;
+
+       next_early_pgt = 0;
+
+       write_cr3(__pa(early_level4_pgt));
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
 {
-       pgd_t *pgd = pgd_offset_k(0UL);
-       pgd_clear(pgd);
-       __flush_tlb_all();
+       unsigned long physaddr = address - __PAGE_OFFSET;
+       unsigned long i;
+       pgdval_t pgd, *pgd_p;
+       pudval_t pud, *pud_p;
+       pmdval_t pmd, *pmd_p;
+
+       /* Invalid address or early pgt is done ?  */
+       if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+               return -1;
+
+again:
+       pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+       pgd = *pgd_p;
+
+       /*
+        * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+        * critical -- __PAGE_OFFSET would point us back into the dynamic
+        * range and we might end up looping forever...
+        */
+       if (pgd)
+               pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+       else {
+               if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+                       reset_early_page_tables();
+                       goto again;
+               }
+
+               pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+               for (i = 0; i < PTRS_PER_PUD; i++)
+                       pud_p[i] = 0;
+               *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+       }
+       pud_p += pud_index(address);
+       pud = *pud_p;
+
+       if (pud)
+               pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+       else {
+               if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+                       reset_early_page_tables();
+                       goto again;
+               }
+
+               pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+               for (i = 0; i < PTRS_PER_PMD; i++)
+                       pmd_p[i] = 0;
+               *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+       }
+       pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+       pmd_p[pmd_index(address)] = pmd;
+
+       return 0;
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
@@ -41,13 +112,25 @@ static void __init clear_bss(void)
               (unsigned long) __bss_stop - (unsigned long) __bss_start);
 }
 
+static unsigned long get_cmd_line_ptr(void)
+{
+       unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+       cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
+
+       return cmd_line_ptr;
+}
+
 static void __init copy_bootdata(char *real_mode_data)
 {
        char * command_line;
+       unsigned long cmd_line_ptr;
 
        memcpy(&boot_params, real_mode_data, sizeof boot_params);
-       if (boot_params.hdr.cmd_line_ptr) {
-               command_line = __va(boot_params.hdr.cmd_line_ptr);
+       sanitize_boot_params(&boot_params);
+       cmd_line_ptr = get_cmd_line_ptr();
+       if (cmd_line_ptr) {
+               command_line = __va(cmd_line_ptr);
                memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
        }
 }
@@ -70,14 +153,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
                                (__START_KERNEL & PGDIR_MASK)));
        BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+       /* Kill off the identity-map trampoline */
+       reset_early_page_tables();
+
        /* clear bss before set_intr_gate with early_idt_handler */
        clear_bss();
 
-       /* Make NULL pointers segfault */
-       zap_identity_mappings();
-
-       max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
-
        for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
 #ifdef CONFIG_EARLY_PRINTK
                set_intr_gate(i, &early_idt_handlers[i]);
@@ -87,37 +168,25 @@ void __init x86_64_start_kernel(char * real_mode_data)
        }
        load_idt((const struct desc_ptr *)&idt_descr);
 
+       copy_bootdata(__va(real_mode_data));
+
        if (console_loglevel == 10)
                early_printk("Kernel alive\n");
 
+       clear_page(init_level4_pgt);
+       /* set init_level4_pgt kernel high mapping */
+       init_level4_pgt[511] = early_level4_pgt[511];
+
        x86_64_start_reservations(real_mode_data);
 }
 
 void __init x86_64_start_reservations(char *real_mode_data)
 {
-       copy_bootdata(__va(real_mode_data));
-
-       memblock_reserve(__pa_symbol(_text),
-                        (unsigned long)__bss_stop - (unsigned long)_text);
-
-#ifdef CONFIG_BLK_DEV_INITRD
-       /* Reserve INITRD */
-       if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-               /* Assume only end is not page aligned */
-               unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-               unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-               unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-               memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-       }
-#endif
+       /* hdr.version is non-zero once boot data has already been copied */
+       if (!boot_params.hdr.version)
+               copy_bootdata(__va(real_mode_data));
 
        reserve_ebda_region();
 
-       /*
-        * At this point everything still needed from the boot loader
-        * or BIOS or kernel text should be early reserved or marked not
-        * RAM in e820. All other memory is free game.
-        */
-
        start_kernel();
 }
index 980053c4b9cc5e2cef4dc421a209c02f58bd0620..d94f6d68be2a9267d462cdb9c305a36052d4b2cc 100644 (file)
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
        .code64
        .globl startup_64
 startup_64:
-
        /*
         * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
         * and someone has loaded an identity mapped page table
         * for us.  These identity mapped page tables map all of the
         * kernel pages and possibly all of memory.
         *
-        * %esi holds a physical pointer to real_mode_data.
+        * %rsi holds a physical pointer to real_mode_data.
         *
         * We come here either directly from a 64bit bootloader, or from
         * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
         * tables and then reload them.
         */
 
-       /* Compute the delta between the address I am compiled to run at and the
+       /*
+        * Compute the delta between the address I am compiled to run at and the
         * address I am actually running at.
         */
        leaq    _text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
        testl   %eax, %eax
        jnz     bad_address
 
-       /* Is the address too large? */
-       leaq    _text(%rip), %rdx
-       movq    $PGDIR_SIZE, %rax
-       cmpq    %rax, %rdx
-       jae     bad_address
-
-       /* Fixup the physical addresses in the page table
+       /*
+        * Is the address too large?
         */
-       addq    %rbp, init_level4_pgt + 0(%rip)
-       addq    %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
-       addq    %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+       leaq    _text(%rip), %rax
+       shrq    $MAX_PHYSMEM_BITS, %rax
+       jnz     bad_address
 
-       addq    %rbp, level3_ident_pgt + 0(%rip)
+       /*
+        * Fixup the physical addresses in the page table
+        */
+       addq    %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
 
        addq    %rbp, level3_kernel_pgt + (510*8)(%rip)
        addq    %rbp, level3_kernel_pgt + (511*8)(%rip)
 
        addq    %rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-       /* Add an Identity mapping if I am above 1G */
+       /*
+        * Set up the identity mapping for the switchover.  These
+        * entries should *NOT* have the global bit set!  This also
+        * creates a bunch of nonsense entries but that is fine --
+        * it avoids problems around wraparound.
+        */
        leaq    _text(%rip), %rdi
-       andq    $PMD_PAGE_MASK, %rdi
+       leaq    early_level4_pgt(%rip), %rbx
 
        movq    %rdi, %rax
-       shrq    $PUD_SHIFT, %rax
-       andq    $(PTRS_PER_PUD - 1), %rax
-       jz      ident_complete
+       shrq    $PGDIR_SHIFT, %rax
 
-       leaq    (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
-       leaq    level3_ident_pgt(%rip), %rbx
-       movq    %rdx, 0(%rbx, %rax, 8)
+       leaq    (4096 + _KERNPG_TABLE)(%rbx), %rdx
+       movq    %rdx, 0(%rbx,%rax,8)
+       movq    %rdx, 8(%rbx,%rax,8)
 
+       addq    $4096, %rdx
        movq    %rdi, %rax
-       shrq    $PMD_SHIFT, %rax
-       andq    $(PTRS_PER_PMD - 1), %rax
-       leaq    __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
-       leaq    level2_spare_pgt(%rip), %rbx
-       movq    %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+       shrq    $PUD_SHIFT, %rax
+       andl    $(PTRS_PER_PUD-1), %eax
+       movq    %rdx, (4096+0)(%rbx,%rax,8)
+       movq    %rdx, (4096+8)(%rbx,%rax,8)
+
+       addq    $8192, %rbx
+       movq    %rdi, %rax
+       shrq    $PMD_SHIFT, %rdi
+       addq    $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+       leaq    (_end - 1)(%rip), %rcx
+       shrq    $PMD_SHIFT, %rcx
+       subq    %rdi, %rcx
+       incl    %ecx
+
+1:
+       andq    $(PTRS_PER_PMD - 1), %rdi
+       movq    %rax, (%rbx,%rdi,8)
+       incq    %rdi
+       addq    $PMD_SIZE, %rax
+       decl    %ecx
+       jnz     1b
 
        /*
         * Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
         * cleanup_highmap() fixes this up along with the mappings
         * beyond _end.
         */
-
        leaq    level2_kernel_pgt(%rip), %rdi
        leaq    4096(%rdi), %r8
        /* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
        /* Fixup phys_base */
        addq    %rbp, phys_base(%rip)
 
-       /* Due to ENTRY(), sometimes the empty space gets filled with
-        * zeros. Better take a jmp than relying on empty space being
-        * filled with 0x90 (nop)
-        */
-       jmp secondary_startup_64
+       movq    $(early_level4_pgt - __START_KERNEL_map), %rax
+       jmp 1f
 ENTRY(secondary_startup_64)
        /*
         * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
         * and someone has loaded a mapped page table.
         *
-        * %esi holds a physical pointer to real_mode_data.
+        * %rsi holds a physical pointer to real_mode_data.
         *
         * We come here either from startup_64 (using physical addresses)
         * or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
         * after the boot processor executes this code.
         */
 
+       movq    $(init_level4_pgt - __START_KERNEL_map), %rax
+1:
+
        /* Enable PAE mode and PGE */
-       movl    $(X86_CR4_PAE | X86_CR4_PGE), %eax
-       movq    %rax, %cr4
+       movl    $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+       movq    %rcx, %cr4
 
        /* Setup early boot stage 4 level pagetables. */
-       movq    $(init_level4_pgt - __START_KERNEL_map), %rax
        addq    phys_base(%rip), %rax
        movq    %rax, %cr3
 
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
        movq    %rax, %cr0
 
        /* Setup a boot time stack */
-       movq stack_start(%rip),%rsp
+       movq stack_start(%rip), %rsp
 
        /* zero EFLAGS after setting rsp */
        pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
        movl    initial_gs+4(%rip),%edx
        wrmsr   
 
-       /* esi is pointer to real mode structure with interesting info.
+       /* rsi is pointer to real mode structure with interesting info.
           pass it to C */
-       movl    %esi, %edi
+       movq    %rsi, %rdi
        
        /* Finally jump to run C code and to be on real kernel address
         * Since we are running on identity-mapped space we have to jump
         * to the full 64bit address, this is only possible as indirect
         * jump.  In addition we need to ensure %cs is set so we make this
         * a far return.
+        *
+        * Note: do not change to far jump indirect with 64bit offset.
+        *
+        * AMD does not support far jump indirect with 64bit offset.
+        * AMD64 Architecture Programmer's Manual, Volume 3: states only
+        *      JMP FAR mem16:16 FF /5 Far jump indirect,
+        *              with the target specified by a far pointer in memory.
+        *      JMP FAR mem16:32 FF /5 Far jump indirect,
+        *              with the target specified by a far pointer in memory.
+        *
+        * Intel64 does support 64bit offset.
+        * Software Developer Manual Vol 2: states:
+        *      FF /5 JMP m16:16 Jump far, absolute indirect,
+        *              address given in m16:16
+        *      FF /5 JMP m16:32 Jump far, absolute indirect,
+        *              address given in m16:32.
+        *      REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+        *              address given in m16:64.
         */
        movq    initial_code(%rip),%rax
        pushq   $0              # fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
 
        /* SMP bootup changes these two */
        __REFDATA
-       .align  8
-       ENTRY(initial_code)
+       .balign 8
+       GLOBAL(initial_code)
        .quad   x86_64_start_kernel
-       ENTRY(initial_gs)
+       GLOBAL(initial_gs)
        .quad   INIT_PER_CPU_VAR(irq_stack_union)
 
-       ENTRY(stack_start)
+       GLOBAL(stack_start)
        .quad  init_thread_union+THREAD_SIZE-8
        .word  0
        __FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
 bad_address:
        jmp bad_address
 
-       .section ".init.text","ax"
+       __INIT
        .globl early_idt_handlers
 early_idt_handlers:
        # 104(%rsp) %rflags
@@ -321,14 +354,22 @@ ENTRY(early_idt_handler)
        pushq %r11              #  0(%rsp)
 
        cmpl $__KERNEL_CS,96(%rsp)
-       jne 10f
+       jne 11f
+
+       cmpl $14,72(%rsp)       # Page fault?
+       jnz 10f
+       GET_CR2_INTO(%rdi)      # can clobber any volatile register if pv
+       call early_make_pgtable
+       andl %eax,%eax
+       jz 20f                  # All good
 
+10:
        leaq 88(%rsp),%rdi      # Pointer to %rip
        call early_fixup_exception
        andl %eax,%eax
        jnz 20f                 # Found an exception entry
 
-10:
+11:
 #ifdef CONFIG_EARLY_PRINTK
        GET_CR2_INTO(%r9)       # can clobber any volatile register if pv
        movl 80(%rsp),%r8d      # error code
@@ -350,7 +391,7 @@ ENTRY(early_idt_handler)
 1:     hlt
        jmp 1b
 
-20:    # Exception table entry found
+20:    # Exception table entry found or page table generated
        popq %r11
        popq %r10
        popq %r9
@@ -364,6 +405,8 @@ ENTRY(early_idt_handler)
        decl early_recursion_flag(%rip)
        INTERRUPT_RETURN
 
+       __INITDATA
+
        .balign 4
 early_recursion_flag:
        .long 0
@@ -374,11 +417,10 @@ early_idt_msg:
 early_idt_ripmsg:
        .asciz "RIP %s\n"
 #endif /* CONFIG_EARLY_PRINTK */
-       .previous
 
 #define NEXT_PAGE(name) \
        .balign PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)                       \
@@ -388,24 +430,37 @@ ENTRY(name)
        i = i + 1 ;                                     \
        .endr
 
+       __INITDATA
+NEXT_PAGE(early_level4_pgt)
+       .fill   511,8,0
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+NEXT_PAGE(early_dynamic_pgts)
+       .fill   512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+
        .data
-       /*
-        * This default setting generates an ident mapping at address 0x100000
-        * and a mapping for the kernel that precisely maps virtual address
-        * 0xffffffff80000000 to physical address 0x000000. (always using
-        * 2Mbyte large pages provided by PAE mode)
-        */
+
+#ifndef CONFIG_XEN
 NEXT_PAGE(init_level4_pgt)
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .org    init_level4_pgt + L4_START_KERNEL*8, 0
+       .fill   512,8,0
+#else
+NEXT_PAGE(init_level4_pgt)
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .org    init_level4_pgt + L4_START_KERNEL*8, 0
        /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
 NEXT_PAGE(level3_ident_pgt)
        .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .fill   511,8,0
+       .fill   511, 8, 0
+NEXT_PAGE(level2_ident_pgt)
+       /* Since I easily can, map the first 1G.
+        * Don't set NX because code runs from these pages.
+        */
+       PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#endif
 
 NEXT_PAGE(level3_kernel_pgt)
        .fill   L3_START_KERNEL,8,0
@@ -413,21 +468,6 @@ NEXT_PAGE(level3_kernel_pgt)
        .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
 
-NEXT_PAGE(level2_fixmap_pgt)
-       .fill   506,8,0
-       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-       /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
-       .fill   5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
-       .fill   512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
-       /* Since I easily can, map the first 1G.
-        * Don't set NX because code runs from these pages.
-        */
-       PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
 NEXT_PAGE(level2_kernel_pgt)
        /*
         * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +482,16 @@ NEXT_PAGE(level2_kernel_pgt)
        PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
                KERNEL_IMAGE_SIZE/PMD_SIZE)
 
-NEXT_PAGE(level2_spare_pgt)
-       .fill   512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+       .fill   506,8,0
+       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+       /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+       .fill   5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+       .fill   512,8,0
 
 #undef PMDS
-#undef NEXT_PAGE
 
        .data
        .align 16
@@ -472,6 +517,5 @@ ENTRY(nmi_idt_table)
        .skip IDT_ENTRIES * 16
 
        __PAGE_ALIGNED_BSS
-       .align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
        .skip PAGE_SIZE
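
The head_64.S changes above replace the statically prebuilt ident mapping with a minimal early_level4_pgt plus a pool of early_dynamic_pgts pages, and teach the early IDT handler to call early_make_pgtable() when the fault vector is 14 (#PF), so mappings for data touched above the prebuilt range (for example a ramdisk loaded above 4G) are built on demand. The sketch below is a simplified userspace model of that on-demand fill, not the kernel's code; only the table-pool idea, the index arithmetic and the 2 MiB granularity are carried over, and the flag values are illustrative.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define ENTRIES              512
    #define EARLY_DYNAMIC_TABLES 64        /* pool size, illustrative only */
    #define PMD_SHIFT            21
    #define PUD_SHIFT            30
    #define PGD_SHIFT            39

    static uint64_t pgd[ENTRIES];
    static uint64_t pool[EARLY_DYNAMIC_TABLES][ENTRIES];
    static unsigned int next_table;

    static uint64_t *alloc_table(void)
    {
            if (next_table >= EARLY_DYNAMIC_TABLES)
                    return NULL;           /* the kernel recycles the pool instead */
            memset(pool[next_table], 0, sizeof(pool[next_table]));
            return pool[next_table++];
    }

    /* Fill in a 2 MiB identity mapping for one faulting address. */
    static int make_pgtable(uint64_t address)
    {
            uint64_t *pud, *pmd;
            unsigned int idx;

            idx = (address >> PGD_SHIFT) & (ENTRIES - 1);
            if (!pgd[idx]) {
                    pud = alloc_table();
                    if (!pud)
                            return -1;
                    pgd[idx] = (uintptr_t)pud | 1;          /* present */
            }
            pud = (uint64_t *)(uintptr_t)(pgd[idx] & ~1ULL);

            idx = (address >> PUD_SHIFT) & (ENTRIES - 1);
            if (!pud[idx]) {
                    pmd = alloc_table();
                    if (!pmd)
                            return -1;
                    pud[idx] = (uintptr_t)pmd | 1;
            }
            pmd = (uint64_t *)(uintptr_t)(pud[idx] & ~1ULL);

            idx = (address >> PMD_SHIFT) & (ENTRIES - 1);
            pmd[idx] = (address & ~((1ULL << PMD_SHIFT) - 1)) | 0x83; /* 2M page */
            return 0;
    }

    int main(void)
    {
            make_pgtable(0x140000000ULL);  /* e.g. first touch of data above 4G */
            printf("tables consumed: %u\n", next_table);
            return 0;
    }

In the kernel the same walk is done against early_level4_pgt with physical addresses and the _KERNPG_TABLE/_PAGE_TABLE flag sets, and the handler simply returns to the faulting instruction once the entry is installed (the "jz 20f  # All good" path above).
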
index b3ea9db39db6f7ee9f9dab00632f754d8a75827d..4eabc160696f510ec5ffaddc7939dd71856463d5 100644 (file)
 #include <linux/io.h>
 #include <linux/suspend.h>
 
+#include <asm/init.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 #include <asm/debugreg.h>
 
-static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
-                               unsigned long addr)
-{
-       pud_t *pud;
-       pmd_t *pmd;
-       struct page *page;
-       int result = -ENOMEM;
-
-       addr &= PMD_MASK;
-       pgd += pgd_index(addr);
-       if (!pgd_present(*pgd)) {
-               page = kimage_alloc_control_pages(image, 0);
-               if (!page)
-                       goto out;
-               pud = (pud_t *)page_address(page);
-               clear_page(pud);
-               set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
-       }
-       pud = pud_offset(pgd, addr);
-       if (!pud_present(*pud)) {
-               page = kimage_alloc_control_pages(image, 0);
-               if (!page)
-                       goto out;
-               pmd = (pmd_t *)page_address(page);
-               clear_page(pmd);
-               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
-       }
-       pmd = pmd_offset(pud, addr);
-       if (!pmd_present(*pmd))
-               set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
-       result = 0;
-out:
-       return result;
-}
-
-static void init_level2_page(pmd_t *level2p, unsigned long addr)
-{
-       unsigned long end_addr;
-
-       addr &= PAGE_MASK;
-       end_addr = addr + PUD_SIZE;
-       while (addr < end_addr) {
-               set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
-               addr += PMD_SIZE;
-       }
-}
-
-static int init_level3_page(struct kimage *image, pud_t *level3p,
-                               unsigned long addr, unsigned long last_addr)
-{
-       unsigned long end_addr;
-       int result;
-
-       result = 0;
-       addr &= PAGE_MASK;
-       end_addr = addr + PGDIR_SIZE;
-       while ((addr < last_addr) && (addr < end_addr)) {
-               struct page *page;
-               pmd_t *level2p;
-
-               page = kimage_alloc_control_pages(image, 0);
-               if (!page) {
-                       result = -ENOMEM;
-                       goto out;
-               }
-               level2p = (pmd_t *)page_address(page);
-               init_level2_page(level2p, addr);
-               set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
-               addr += PUD_SIZE;
-       }
-       /* clear the unused entries */
-       while (addr < end_addr) {
-               pud_clear(level3p++);
-               addr += PUD_SIZE;
-       }
-out:
-       return result;
-}
-
-
-static int init_level4_page(struct kimage *image, pgd_t *level4p,
-                               unsigned long addr, unsigned long last_addr)
-{
-       unsigned long end_addr;
-       int result;
-
-       result = 0;
-       addr &= PAGE_MASK;
-       end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
-       while ((addr < last_addr) && (addr < end_addr)) {
-               struct page *page;
-               pud_t *level3p;
-
-               page = kimage_alloc_control_pages(image, 0);
-               if (!page) {
-                       result = -ENOMEM;
-                       goto out;
-               }
-               level3p = (pud_t *)page_address(page);
-               result = init_level3_page(image, level3p, addr, last_addr);
-               if (result)
-                       goto out;
-               set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
-               addr += PGDIR_SIZE;
-       }
-       /* clear the unused entries */
-       while (addr < end_addr) {
-               pgd_clear(level4p++);
-               addr += PGDIR_SIZE;
-       }
-out:
-       return result;
-}
-
 static void free_transition_pgtable(struct kimage *image)
 {
        free_page((unsigned long)image->arch.pud);
@@ -184,22 +71,62 @@ err:
        return result;
 }
 
+static void *alloc_pgt_page(void *data)
+{
+       struct kimage *image = (struct kimage *)data;
+       struct page *page;
+       void *p = NULL;
+
+       page = kimage_alloc_control_pages(image, 0);
+       if (page) {
+               p = page_address(page);
+               clear_page(p);
+       }
+
+       return p;
+}
 
 static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 {
+       struct x86_mapping_info info = {
+               .alloc_pgt_page = alloc_pgt_page,
+               .context        = image,
+               .pmd_flag       = __PAGE_KERNEL_LARGE_EXEC,
+       };
+       unsigned long mstart, mend;
        pgd_t *level4p;
        int result;
+       int i;
+
        level4p = (pgd_t *)__va(start_pgtable);
-       result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
-       if (result)
-               return result;
+       clear_page(level4p);
+       for (i = 0; i < nr_pfn_mapped; i++) {
+               mstart = pfn_mapped[i].start << PAGE_SHIFT;
+               mend   = pfn_mapped[i].end << PAGE_SHIFT;
+
+               result = kernel_ident_mapping_init(&info,
+                                                level4p, mstart, mend);
+               if (result)
+                       return result;
+       }
+
        /*
-        * image->start may be outside 0 ~ max_pfn, for example when
-        * jump back to original kernel from kexeced kernel
+        * The segments' memory ranges could be outside 0 ~ max_pfn, for
+        * example when jumping back to the original kernel from a kexec'd
+        * kernel, or when the first kernel was booted with a user-supplied
+        * memory map and the second kernel is loaded outside that range.
         */
-       result = init_one_level2_page(image, level4p, image->start);
-       if (result)
-               return result;
+       for (i = 0; i < image->nr_segments; i++) {
+               mstart = image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+
+               result = kernel_ident_mapping_init(&info,
+                                                level4p, mstart, mend);
+
+               if (result)
+                       return result;
+       }
+
        return init_transition_pgtable(image, level4p);
 }
 
index 8354399b3aae21082d94cb6478cf29ca93e4ee84..be6e435cfc0549502f1cadda7cd3cd5d35505451 100644 (file)
 #include <asm/topology.h>
 #include <asm/apicdef.h>
 #include <asm/amd_nb.h>
-#ifdef CONFIG_X86_64
-#include <asm/numa_64.h>
-#endif
 #include <asm/mce.h>
 #include <asm/alternative.h>
 #include <asm/prom.h>
 
 /*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped:     highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
@@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align)
        return ret;
 }
 
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
-       if (direct_gbpages && cpu_has_gbpages)
-               printk(KERN_INFO "Using GB pages for direct mapping\n");
-       else
-               direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
-{
-}
+#ifdef CONFIG_X86_32
 static void __init cleanup_highmap(void)
 {
 }
@@ -306,27 +294,43 @@ static void __init reserve_brk(void)
 
 #ifdef CONFIG_BLK_DEV_INITRD
 
+static u64 __init get_ramdisk_image(void)
+{
+       u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+
+       ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
+
+       return ramdisk_image;
+}
+static u64 __init get_ramdisk_size(void)
+{
+       u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+
+       ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
+
+       return ramdisk_size;
+}
+
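
The two helpers above splice the new ext_ramdisk_image/ext_ramdisk_size fields in as the high 32 bits of the legacy header values, which is what lets a boot loader place the ramdisk above 4G. A standalone demo of the combination, with made-up values:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t hdr_ramdisk_image = 0x7f000000;   /* legacy low 32 bits */
            uint32_t ext_ramdisk_image = 0x00000001;   /* new high 32 bits   */
            uint64_t addr = hdr_ramdisk_image |
                            ((uint64_t)ext_ramdisk_image << 32);

            printf("ramdisk at %#" PRIx64 "\n", addr); /* 0x17f000000, above 4G */
            return 0;
    }
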
 #define MAX_MAP_CHUNK  (NR_FIX_BTMAPS << PAGE_SHIFT)
 static void __init relocate_initrd(void)
 {
        /* Assume only end is not page aligned */
-       u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-       u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+       u64 ramdisk_image = get_ramdisk_image();
+       u64 ramdisk_size  = get_ramdisk_size();
        u64 area_size     = PAGE_ALIGN(ramdisk_size);
-       u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
        u64 ramdisk_here;
        unsigned long slop, clen, mapaddr;
        char *p, *q;
 
-       /* We need to move the initrd down into lowmem */
-       ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
-                                        PAGE_SIZE);
+       /* We need to move the initrd down into directly mapped mem */
+       ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+                                                area_size, PAGE_SIZE);
 
        if (!ramdisk_here)
                panic("Cannot find place for new RAMDISK of size %lld\n",
                         ramdisk_size);
 
-       /* Note: this includes all the lowmem currently occupied by
+       /* Note: this includes all the mem currently occupied by
           the initrd, we rely on that fact to keep the data intact. */
        memblock_reserve(ramdisk_here, area_size);
        initrd_start = ramdisk_here + PAGE_OFFSET;
@@ -336,17 +340,7 @@ static void __init relocate_initrd(void)
 
        q = (char *)initrd_start;
 
-       /* Copy any lowmem portion of the initrd */
-       if (ramdisk_image < end_of_lowmem) {
-               clen = end_of_lowmem - ramdisk_image;
-               p = (char *)__va(ramdisk_image);
-               memcpy(q, p, clen);
-               q += clen;
-               ramdisk_image += clen;
-               ramdisk_size  -= clen;
-       }
-
-       /* Copy the highmem portion of the initrd */
+       /* Copy the initrd */
        while (ramdisk_size) {
                slop = ramdisk_image & ~PAGE_MASK;
                clen = ramdisk_size;
@@ -360,22 +354,35 @@ static void __init relocate_initrd(void)
                ramdisk_image += clen;
                ramdisk_size  -= clen;
        }
-       /* high pages is not converted by early_res_to_bootmem */
-       ramdisk_image = boot_params.hdr.ramdisk_image;
-       ramdisk_size  = boot_params.hdr.ramdisk_size;
+
+       ramdisk_image = get_ramdisk_image();
+       ramdisk_size  = get_ramdisk_size();
        printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
                " [mem %#010llx-%#010llx]\n",
                ramdisk_image, ramdisk_image + ramdisk_size - 1,
                ramdisk_here, ramdisk_here + ramdisk_size - 1);
 }
 
+static void __init early_reserve_initrd(void)
+{
+       /* Assume only end is not page aligned */
+       u64 ramdisk_image = get_ramdisk_image();
+       u64 ramdisk_size  = get_ramdisk_size();
+       u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+
+       if (!boot_params.hdr.type_of_loader ||
+           !ramdisk_image || !ramdisk_size)
+               return;         /* No initrd provided by bootloader */
+
+       memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+}
 static void __init reserve_initrd(void)
 {
        /* Assume only end is not page aligned */
-       u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-       u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+       u64 ramdisk_image = get_ramdisk_image();
+       u64 ramdisk_size  = get_ramdisk_size();
        u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-       u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+       u64 mapped_size;
 
        if (!boot_params.hdr.type_of_loader ||
            !ramdisk_image || !ramdisk_size)
@@ -383,22 +390,18 @@ static void __init reserve_initrd(void)
 
        initrd_start = 0;
 
-       if (ramdisk_size >= (end_of_lowmem>>1)) {
+       mapped_size = memblock_mem_size(max_pfn_mapped);
+       if (ramdisk_size >= (mapped_size>>1))
                panic("initrd too large to handle, "
                       "disabling initrd (%lld needed, %lld available)\n",
-                      ramdisk_size, end_of_lowmem>>1);
-       }
+                      ramdisk_size, mapped_size>>1);
 
        printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
                        ramdisk_end - 1);
 
-
-       if (ramdisk_end <= end_of_lowmem) {
-               /* All in lowmem, easy case */
-               /*
-                * don't need to reserve again, already reserved early
-                * in i386_start_kernel
-                */
+       if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
+                               PFN_DOWN(ramdisk_end))) {
+               /* All are mapped, easy case */
                initrd_start = ramdisk_image + PAGE_OFFSET;
                initrd_end = initrd_start + ramdisk_size;
                return;
@@ -409,6 +412,9 @@ static void __init reserve_initrd(void)
        memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
 }
 #else
+static void __init early_reserve_initrd(void)
+{
+}
 static void __init reserve_initrd(void)
 {
 }
@@ -419,8 +425,6 @@ static void __init parse_setup_data(void)
        struct setup_data *data;
        u64 pa_data;
 
-       if (boot_params.hdr.version < 0x0209)
-               return;
        pa_data = boot_params.hdr.setup_data;
        while (pa_data) {
                u32 data_len, map_len;
@@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void)
        u64 pa_data;
        int found = 0;
 
-       if (boot_params.hdr.version < 0x0209)
-               return;
        pa_data = boot_params.hdr.setup_data;
        while (pa_data) {
                data = early_memremap(pa_data, sizeof(*data));
@@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
        struct setup_data *data;
        u64 pa_data;
 
-       if (boot_params.hdr.version < 0x0209)
-               return;
        pa_data = boot_params.hdr.setup_data;
        while (pa_data) {
                data = early_memremap(pa_data, sizeof(*data));
@@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
  * would limit the kernel to the low 512 MiB due to mapping restrictions.
- * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
- * limit once kexec-tools are fixed.
  */
 #ifdef CONFIG_X86_32
 # define CRASH_KERNEL_ADDR_MAX (512 << 20)
 #else
-# define CRASH_KERNEL_ADDR_MAX (896 << 20)
+# define CRASH_KERNEL_ADDR_MAX MAXMEM
 #endif
 
+static void __init reserve_crashkernel_low(void)
+{
+#ifdef CONFIG_X86_64
+       const unsigned long long alignment = 16<<20;    /* 16M */
+       unsigned long long low_base = 0, low_size = 0;
+       unsigned long total_low_mem;
+       unsigned long long base;
+       int ret;
+
+       total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
+       ret = parse_crashkernel_low(boot_command_line, total_low_mem,
+                                               &low_size, &base);
+       if (ret != 0 || low_size <= 0)
+               return;
+
+       low_base = memblock_find_in_range(low_size, (1ULL<<32),
+                                       low_size, alignment);
+
+       if (!low_base) {
+               pr_info("crashkernel low reservation failed - No suitable area found.\n");
+
+               return;
+       }
+
+       memblock_reserve(low_base, low_size);
+       pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
+                       (unsigned long)(low_size >> 20),
+                       (unsigned long)(low_base >> 20),
+                       (unsigned long)(total_low_mem >> 20));
+       crashk_low_res.start = low_base;
+       crashk_low_res.end   = low_base + low_size - 1;
+       insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+}
+
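
reserve_crashkernel_low() exists because a crash kernel region placed above 4G (now permitted by CRASH_KERNEL_ADDR_MAX = MAXMEM) leaves the kdump kernel with no memory usable for 32-bit DMA/swiotlb; the crashkernel_low= option parsed by parse_crashkernel_low() lets the user reserve an extra block under 4G for that. With purely illustrative sizes, a command line exercising both reservations might look like:

    crashkernel=256M@4G crashkernel_low=72M

Here 256M is reserved at the 4G mark for the crash kernel itself, and, because crash_base ends up at or above 4G, reserve_crashkernel_low() additionally reserves 72M somewhere below 4G and registers it as crashk_low_res.
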
 static void __init reserve_crashkernel(void)
 {
+       const unsigned long long alignment = 16<<20;    /* 16M */
        unsigned long long total_mem;
        unsigned long long crash_size, crash_base;
        int ret;
@@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void)
 
        /* 0 means: find the address automatically */
        if (crash_base <= 0) {
-               const unsigned long long alignment = 16<<20;    /* 16M */
-
                /*
                 *  kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
                 */
@@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void)
                        pr_info("crashkernel reservation failed - No suitable area found.\n");
                        return;
                }
+
        } else {
                unsigned long long start;
 
@@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void)
        crashk_res.start = crash_base;
        crashk_res.end   = crash_base + crash_size - 1;
        insert_resource(&iomem_resource, &crashk_res);
+
+       if (crash_base >= (1ULL<<32))
+               reserve_crashkernel_low();
 }
 #else
 static void __init reserve_crashkernel(void)
@@ -711,6 +747,27 @@ static void __init trim_bios_range(void)
        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 }
 
+/* called before trim_bios_range() to avoid an extra sanitize pass */
+static void __init e820_add_kernel_range(void)
+{
+       u64 start = __pa_symbol(_text);
+       u64 size = __pa_symbol(_end) - start;
+
+       /*
+        * Complain if .text .data and .bss are not marked as E820_RAM and
+        * attempt to fix it by adding the range. We may have a confused BIOS,
+        * or the user may have used memmap=exactmap or memmap=xxM$yyM to
+        * exclude the kernel range. If we really are running on top of
+        * non-RAM, we will crash later anyway.
+        */
+       if (e820_all_mapped(start, start + size, E820_RAM))
+               return;
+
+       pr_warn(".text .data .bss are not marked as E820_RAM!\n");
+       e820_remove_range(start, size, E820_RAM, 0);
+       e820_add_region(start, size, E820_RAM);
+}
+
 static int __init parse_reservelow(char *p)
 {
        unsigned long long size;
@@ -748,6 +805,17 @@ early_param("reservelow", parse_reservelow);
 
 void __init setup_arch(char **cmdline_p)
 {
+       memblock_reserve(__pa_symbol(_text),
+                        (unsigned long)__bss_stop - (unsigned long)_text);
+
+       early_reserve_initrd();
+
+       /*
+        * At this point everything still needed from the boot loader
+        * or BIOS or kernel text should be early reserved or marked not
+        * RAM in e820. All other memory is free game.
+        */
+
 #ifdef CONFIG_X86_32
        memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
        visws_early_detect();
@@ -906,6 +974,7 @@ void __init setup_arch(char **cmdline_p)
        insert_resource(&iomem_resource, &data_resource);
        insert_resource(&iomem_resource, &bss_resource);
 
+       e820_add_kernel_range();
        trim_bios_range();
 #ifdef CONFIG_X86_32
        if (ppro_with_ram_bug()) {
@@ -955,6 +1024,8 @@ void __init setup_arch(char **cmdline_p)
 
        reserve_ibft_region();
 
+       early_alloc_pgt_buf();
+
        /*
         * Need to conclude brk, before memblock_x86_fill()
         *  it could use memblock_find_in_range, could overlap with
@@ -964,7 +1035,7 @@ void __init setup_arch(char **cmdline_p)
 
        cleanup_highmap();
 
-       memblock.current_limit = get_max_mapped();
+       memblock.current_limit = ISA_END_ADDRESS;
        memblock_x86_fill();
 
        /*
@@ -981,41 +1052,21 @@ void __init setup_arch(char **cmdline_p)
        setup_bios_corruption_check();
 #endif
 
+#ifdef CONFIG_X86_32
        printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
                        (max_pfn_mapped<<PAGE_SHIFT) - 1);
+#endif
 
-       setup_real_mode();
+       reserve_real_mode();
 
        trim_platform_memory_ranges();
 
-       init_gbpages();
-
-       /* max_pfn_mapped is updated here */
-       max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
-       max_pfn_mapped = max_low_pfn_mapped;
-
-#ifdef CONFIG_X86_64
-       if (max_pfn > max_low_pfn) {
-               int i;
-               unsigned long start, end;
-               unsigned long start_pfn, end_pfn;
-
-               for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
-                                                        NULL) {
+       init_mem_mapping();
 
-                       end = PFN_PHYS(end_pfn);
-                       if (end <= (1UL<<32))
-                               continue;
+       early_trap_pf_init();
 
-                       start = PFN_PHYS(start_pfn);
-                       max_pfn_mapped = init_memory_mapping(
-                                               max((1UL<<32), start), end);
-               }
+       setup_real_mode();
 
-               /* can we preseve max_low_pfn ?*/
-               max_low_pfn = max_pfn;
-       }
-#endif
        memblock.current_limit = get_max_mapped();
        dma_contiguous_reserve(0);
 
index ecffca11f4e92cb1df5bbf4bcedb0dcefb86bc8a..68bda7a841597ee1fc06e15b96865d1eff2633ce 100644 (file)
@@ -688,10 +688,19 @@ void __init early_trap_init(void)
        set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
        /* int3 can be called from all */
        set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+#ifdef CONFIG_X86_32
        set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
        load_idt(&idt_descr);
 }
 
+void __init early_trap_pf_init(void)
+{
+#ifdef CONFIG_X86_64
+       set_intr_gate(X86_TRAP_PF, &page_fault);
+#endif
+}
+
 void __init trap_init(void)
 {
        int i;
index 7a3d075a814a9c83a603df93b5961f247049e624..50cf83ecd32e29e75f7742b23c703ac9da0e256d 100644 (file)
@@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = {
                .banner                 = default_banner,
        },
 
-       .mapping = {
-               .pagetable_reserve              = native_pagetable_reserve,
-       },
-
        .paging = {
                .pagetable_init         = native_pagetable_init,
        },
index d7aea41563b372437eb227a499259be23d755564..d41815265a0b250d35e1d03834d14439faa4e679 100644 (file)
 #include <asm/proto.h>
 #include <asm/dma.h>           /* for MAX_DMA_PFN */
 
-unsigned long __initdata pgt_buf_start;
-unsigned long __meminitdata pgt_buf_end;
-unsigned long __meminitdata pgt_buf_top;
+#include "mm_internal.h"
 
-int after_bootmem;
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
 
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
-                               = 1
-#endif
-;
+static unsigned long min_pfn_mapped;
 
-struct map_range {
-       unsigned long start;
-       unsigned long end;
-       unsigned page_size_mask;
-};
+static bool __initdata can_use_brk_pgt = true;
 
 /*
- * First calculate space needed for kernel direct mapping page tables to cover
- * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
- * pages. Then find enough contiguous space for those page tables.
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
  */
-static void __init find_early_table_space(struct map_range *mr, int nr_range)
+__ref void *alloc_low_pages(unsigned int num)
 {
+       unsigned long pfn;
        int i;
-       unsigned long puds = 0, pmds = 0, ptes = 0, tables;
-       unsigned long start = 0, good_end;
-       phys_addr_t base;
 
-       for (i = 0; i < nr_range; i++) {
-               unsigned long range, extra;
+       if (after_bootmem) {
+               unsigned int order;
 
-               range = mr[i].end - mr[i].start;
-               puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
+               order = get_order((unsigned long)num << PAGE_SHIFT);
+               return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+                                               __GFP_ZERO, order);
+       }
 
-               if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
-                       extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
-                       pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-               } else {
-                       pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
-               }
+       if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
+               unsigned long ret;
+               if (min_pfn_mapped >= max_pfn_mapped)
+                       panic("alloc_low_page: ran out of memory");
+               ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+                                       max_pfn_mapped << PAGE_SHIFT,
+                                       PAGE_SIZE * num, PAGE_SIZE);
+               if (!ret)
+                       panic("alloc_low_page: can not alloc memory");
+               memblock_reserve(ret, PAGE_SIZE * num);
+               pfn = ret >> PAGE_SHIFT;
+       } else {
+               pfn = pgt_buf_end;
+               pgt_buf_end += num;
+               printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+                       pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
+       }
 
-               if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
-                       extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
-                       extra += PMD_SIZE;
-#endif
-                       ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               } else {
-                       ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               }
+       for (i = 0; i < num; i++) {
+               void *adr;
+
+               adr = __va((pfn + i) << PAGE_SHIFT);
+               clear_page(adr);
        }
 
-       tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-       tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-       tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+       return __va(pfn << PAGE_SHIFT);
+}
 
-#ifdef CONFIG_X86_32
-       /* for fixmap */
-       tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-       good_end = max_pfn_mapped << PAGE_SHIFT;
+/* need four 4k pages for the initial PMD_SIZE mapping, one more for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE      (5 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void  __init early_alloc_pgt_buf(void)
+{
+       unsigned long tables = INIT_PGT_BUF_SIZE;
+       phys_addr_t base;
 
-       base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
-       if (!base)
-               panic("Cannot find space for the kernel page tables");
+       base = __pa(extend_brk(tables, PAGE_SIZE));
 
        pgt_buf_start = base >> PAGE_SHIFT;
        pgt_buf_end = pgt_buf_start;
        pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+                               = 1
+#endif
+;
 
-       printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
-               mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
-               (pgt_buf_top << PAGE_SHIFT) - 1);
+static void __init init_gbpages(void)
+{
+#ifdef CONFIG_X86_64
+       if (direct_gbpages && cpu_has_gbpages)
+               printk(KERN_INFO "Using GB pages for direct mapping\n");
+       else
+               direct_gbpages = 0;
+#endif
 }
 
-void __init native_pagetable_reserve(u64 start, u64 end)
+struct map_range {
+       unsigned long start;
+       unsigned long end;
+       unsigned page_size_mask;
+};
+
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
 {
-       memblock_reserve(start, end - start);
+       init_gbpages();
+
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+       /*
+        * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+        * This will simplify cpa(), which otherwise needs to support splitting
+        * large pages into small in interrupt context, etc.
+        */
+       if (direct_gbpages)
+               page_size_mask |= 1 << PG_LEVEL_1G;
+       if (cpu_has_pse)
+               page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+       /* Enable PSE if available */
+       if (cpu_has_pse)
+               set_in_cr4(X86_CR4_PSE);
+
+       /* Enable PGE if available */
+       if (cpu_has_pge) {
+               set_in_cr4(X86_CR4_PGE);
+               __supported_pte_mask |= _PAGE_GLOBAL;
+       }
 }
 
 #ifdef CONFIG_X86_32
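
The effect of the block above is that page-table pages for the direct mapping no longer come from a single up-front find_early_table_space() estimate: alloc_low_pages() first consumes the small brk buffer set up by early_alloc_pgt_buf(), then falls back to memblock inside the pfn window that is already mapped, and every page it returns is zeroed and part of the direct mapping, so callers can write entries into it immediately. A caller therefore shrinks to something like this illustrative kernel-style fragment:

    /* Illustrative: grab one zeroed, already-mapped page for a PTE table. */
    pte_t *page_table = (pte_t *)alloc_low_pages(1);

    set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));

The init_32.c hunks below rewrite one_md_table_init()/one_page_table_init() to this pattern (via the alloc_low_page() wrapper), dropping their after_bootmem special cases.
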
@@ -122,58 +168,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
 }
 
 /*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
+ * Adjust the page_size_mask for small ranges to use big page sizes
+ * instead of small ones if the surrounding memory is RAM too.
  */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-                                              unsigned long end)
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+                                                        int nr_range)
 {
-       unsigned long page_size_mask = 0;
-       unsigned long start_pfn, end_pfn;
-       unsigned long ret = 0;
-       unsigned long pos;
-
-       struct map_range mr[NR_RANGE_MR];
-       int nr_range, i;
-       int use_pse, use_gbpages;
+       int i;
 
-       printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
-              start, end - 1);
+       for (i = 0; i < nr_range; i++) {
+               if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+                   !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+                       unsigned long start = round_down(mr[i].start, PMD_SIZE);
+                       unsigned long end = round_up(mr[i].end, PMD_SIZE);
 
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-       /*
-        * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-        * This will simplify cpa(), which otherwise needs to support splitting
-        * large pages into small in interrupt context, etc.
-        */
-       use_pse = use_gbpages = 0;
-#else
-       use_pse = cpu_has_pse;
-       use_gbpages = direct_gbpages;
+#ifdef CONFIG_X86_32
+                       if ((end >> PAGE_SHIFT) > max_low_pfn)
+                               continue;
 #endif
 
-       /* Enable PSE if available */
-       if (cpu_has_pse)
-               set_in_cr4(X86_CR4_PSE);
+                       if (memblock_is_region_memory(start, end - start))
+                               mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+               }
+               if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+                   !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+                       unsigned long start = round_down(mr[i].start, PUD_SIZE);
+                       unsigned long end = round_up(mr[i].end, PUD_SIZE);
 
-       /* Enable PGE if available */
-       if (cpu_has_pge) {
-               set_in_cr4(X86_CR4_PGE);
-               __supported_pte_mask |= _PAGE_GLOBAL;
+                       if (memblock_is_region_memory(start, end - start))
+                               mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+               }
        }
+}
 
-       if (use_gbpages)
-               page_size_mask |= 1 << PG_LEVEL_1G;
-       if (use_pse)
-               page_size_mask |= 1 << PG_LEVEL_2M;
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+                                    unsigned long start,
+                                    unsigned long end)
+{
+       unsigned long start_pfn, end_pfn, limit_pfn;
+       unsigned long pfn;
+       int i;
 
-       memset(mr, 0, sizeof(mr));
-       nr_range = 0;
+       limit_pfn = PFN_DOWN(end);
 
        /* head if not big page alignment ? */
-       start_pfn = start >> PAGE_SHIFT;
-       pos = start_pfn << PAGE_SHIFT;
+       pfn = start_pfn = PFN_DOWN(start);
 #ifdef CONFIG_X86_32
        /*
         * Don't use a large page for the first 2/4MB of memory
@@ -181,66 +220,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
         * and overlapping MTRRs into large pages can cause
         * slowdowns.
         */
-       if (pos == 0)
-               end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+       if (pfn == 0)
+               end_pfn = PFN_DOWN(PMD_SIZE);
        else
-               end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                                << (PMD_SHIFT - PAGE_SHIFT);
+               end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #else /* CONFIG_X86_64 */
-       end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-                       << (PMD_SHIFT - PAGE_SHIFT);
+       end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #endif
-       if (end_pfn > (end >> PAGE_SHIFT))
-               end_pfn = end >> PAGE_SHIFT;
+       if (end_pfn > limit_pfn)
+               end_pfn = limit_pfn;
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-               pos = end_pfn << PAGE_SHIFT;
+               pfn = end_pfn;
        }
 
        /* big page (2M) range */
-       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                        << (PMD_SHIFT - PAGE_SHIFT);
+       start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #ifdef CONFIG_X86_32
-       end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+       end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
 #else /* CONFIG_X86_64 */
-       end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-                        << (PUD_SHIFT - PAGE_SHIFT);
-       if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-               end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+       end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+       if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+               end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
 #endif
 
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
-               pos = end_pfn << PAGE_SHIFT;
+               pfn = end_pfn;
        }
 
 #ifdef CONFIG_X86_64
        /* big page (1G) range */
-       start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-                        << (PUD_SHIFT - PAGE_SHIFT);
-       end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+       start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+       end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask &
                                 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
-               pos = end_pfn << PAGE_SHIFT;
+               pfn = end_pfn;
        }
 
        /* tail is not big page (1G) alignment */
-       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                        << (PMD_SHIFT - PAGE_SHIFT);
-       end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+       start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+       end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
        if (start_pfn < end_pfn) {
                nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
                                page_size_mask & (1<<PG_LEVEL_2M));
-               pos = end_pfn << PAGE_SHIFT;
+               pfn = end_pfn;
        }
 #endif
 
        /* tail is not big page (2M) alignment */
-       start_pfn = pos>>PAGE_SHIFT;
-       end_pfn = end>>PAGE_SHIFT;
+       start_pfn = pfn;
+       end_pfn = limit_pfn;
        nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
 
        /* try to merge same page size and continuous */
@@ -257,59 +290,169 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
                nr_range--;
        }
 
+       if (!after_bootmem)
+               adjust_range_page_size_mask(mr, nr_range);
+
        for (i = 0; i < nr_range; i++)
                printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
                                mr[i].start, mr[i].end - 1,
                        (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
                         (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
 
-       /*
-        * Find space for the kernel direct mapping tables.
-        *
-        * Later we should allocate these tables in the local node of the
-        * memory mapped. Unfortunately this is done currently before the
-        * nodes are discovered.
-        */
-       if (!after_bootmem)
-               find_early_table_space(mr, nr_range);
+       return nr_range;
+}
+
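
split_mem_range() above carves a physical range into unaligned head/tail pieces that must use 4k pages and aligned middle pieces that can use 2M (and, on 64-bit, 1G) pages, then merges neighbours of the same size. Ignoring the adjust_range_page_size_mask() upgrade of small edge ranges, mapping [0x100000, 0x80200000) on 64-bit with 1G pages available would come out as these boot log lines:

     [mem 0x00100000-0x001fffff] page 4k
     [mem 0x00200000-0x3fffffff] page 2M
     [mem 0x40000000-0x7fffffff] page 1G
     [mem 0x80000000-0x801fffff] page 2M

(The start/end values are made up for the example; the output format matches the KERN_DEBUG printout above.)
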
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+       nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+                                            nr_pfn_mapped, start_pfn, end_pfn);
+       nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+       max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+       if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+               max_low_pfn_mapped = max(max_low_pfn_mapped,
+                                        min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+       int i;
+
+       for (i = 0; i < nr_pfn_mapped; i++)
+               if ((start_pfn >= pfn_mapped[i].start) &&
+                   (end_pfn <= pfn_mapped[i].end))
+                       return true;
+
+       return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+                                              unsigned long end)
+{
+       struct map_range mr[NR_RANGE_MR];
+       unsigned long ret = 0;
+       int nr_range, i;
+
+       pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+              start, end - 1);
+
+       memset(mr, 0, sizeof(mr));
+       nr_range = split_mem_range(mr, 0, start, end);
 
        for (i = 0; i < nr_range; i++)
                ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
                                                   mr[i].page_size_mask);
 
-#ifdef CONFIG_X86_32
-       early_ioremap_page_table_range_init();
+       add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
 
-       load_cr3(swapper_pg_dir);
-#endif
+       return ret >> PAGE_SHIFT;
+}
 
-       __flush_tlb_all();
+/*
+ * The range may have holes in the middle or at the ends; only the RAM parts will be mapped.
+ */
+static unsigned long __init init_range_memory_mapping(
+                                          unsigned long r_start,
+                                          unsigned long r_end)
+{
+       unsigned long start_pfn, end_pfn;
+       unsigned long mapped_ram_size = 0;
+       int i;
 
-       /*
-        * Reserve the kernel pagetable pages we used (pgt_buf_start -
-        * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
-        * so that they can be reused for other purposes.
-        *
-        * On native it just means calling memblock_reserve, on Xen it also
-        * means marking RW the pagetable pages that we allocated before
-        * but that haven't been used.
-        *
-        * In fact on xen we mark RO the whole range pgt_buf_start -
-        * pgt_buf_top, because we have to make sure that when
-        * init_memory_mapping reaches the pagetable pages area, it maps
-        * RO all the pagetable pages, including the ones that are beyond
-        * pgt_buf_end at that time.
-        */
-       if (!after_bootmem && pgt_buf_end > pgt_buf_start)
-               x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
-                               PFN_PHYS(pgt_buf_end));
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+               u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+               u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+               if (start >= end)
+                       continue;
 
-       if (!after_bootmem)
-               early_memtest(start, end);
+               /*
+                * If this range overlaps the brk pgt buffer, we need to
+                * allocate the pgt buffer from memblock instead.
+                */
+               can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
+                                   min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
+               init_memory_mapping(start, end);
+               mapped_ram_size += end - start;
+               can_use_brk_pgt = true;
+       }
 
-       return ret >> PAGE_SHIFT;
+       return mapped_ram_size;
 }
 
+/* (PUD_SHIFT-PMD_SHIFT)/2 */
+#define STEP_SIZE_SHIFT 5
+void __init init_mem_mapping(void)
+{
+       unsigned long end, real_end, start, last_start;
+       unsigned long step_size;
+       unsigned long addr;
+       unsigned long mapped_ram_size = 0;
+       unsigned long new_mapped_ram_size;
+
+       probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+       end = max_pfn << PAGE_SHIFT;
+#else
+       end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+       /* the ISA range is always mapped regardless of memory holes */
+       init_memory_mapping(0, ISA_END_ADDRESS);
+
+       /* Xen has a big reserved range near the end of RAM, skip it at first */
+       addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
+                        PAGE_SIZE);
+       real_end = addr + PMD_SIZE;
+
+       /* step_size needs to be small so the pgt_buf from BRK can cover it */
+       step_size = PMD_SIZE;
+       max_pfn_mapped = 0; /* will get exact value next */
+       min_pfn_mapped = real_end >> PAGE_SHIFT;
+       last_start = start = real_end;
+       while (last_start > ISA_END_ADDRESS) {
+               if (last_start > step_size) {
+                       start = round_down(last_start - 1, step_size);
+                       if (start < ISA_END_ADDRESS)
+                               start = ISA_END_ADDRESS;
+               } else
+                       start = ISA_END_ADDRESS;
+               new_mapped_ram_size = init_range_memory_mapping(start,
+                                                       last_start);
+               last_start = start;
+               min_pfn_mapped = last_start >> PAGE_SHIFT;
+               /* only increase step_size after a big range gets mapped */
+               if (new_mapped_ram_size > mapped_ram_size)
+                       step_size <<= STEP_SIZE_SHIFT;
+               mapped_ram_size += new_mapped_ram_size;
+       }
+
+       if (real_end < end)
+               init_range_memory_mapping(real_end, end);
+
+#ifdef CONFIG_X86_64
+       if (max_pfn > max_low_pfn) {
+               /* can we preserve max_low_pfn? */
+               max_low_pfn = max_pfn;
+       }
+#else
+       early_ioremap_page_table_range_init();
+#endif
+
+       load_cr3(swapper_pg_dir);
+       __flush_tlb_all();
+
+       early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
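
init_mem_mapping() above maps RAM top-down from real_end toward ISA_END_ADDRESS in steps that start at PMD_SIZE and grow by a factor of 2^STEP_SIZE_SHIFT, so the page tables for each step can always be allocated out of memory mapped by the previous step (or out of the brk pgt_buf for the very first one). The standalone sketch below shows only the step geometry; it grows the step unconditionally, whereas the kernel only grows it after a round actually mapped more RAM, and the 16 GiB end address is made up:

    #include <stdio.h>

    #define PMD_SIZE        (2ULL << 20)
    #define STEP_SIZE_SHIFT 5                 /* (PUD_SHIFT - PMD_SHIFT) / 2 */
    #define ISA_END_ADDRESS 0x100000ULL

    int main(void)
    {
            unsigned long long last_start = 16ULL << 30;  /* pretend real_end */
            unsigned long long step_size = PMD_SIZE;

            while (last_start > ISA_END_ADDRESS) {
                    unsigned long long start = ISA_END_ADDRESS;

                    if (last_start > step_size) {
                            start = (last_start - 1) & ~(step_size - 1);
                            if (start < ISA_END_ADDRESS)
                                    start = ISA_END_ADDRESS;
                    }
                    printf("map [%#llx, %#llx)\n", start, last_start);
                    last_start = start;
                    step_size <<= STEP_SIZE_SHIFT;  /* 2M, 64M, 2G, 64G, ... */
            }
            return 0;
    }

For the 16 GiB example this prints four rounds: the top 2 MiB, then roughly 62 MiB, then just under 2 GiB, and finally everything down to ISA_END_ADDRESS.
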
 
 /*
  * devmem_is_allowed() checks to see if /dev/mem access to a certain address
index 745d66b843c84241f1d849e865dc2796a00b6f1f..b299724f6e34e9ace068a4c7f39d504a7cc5c7b4 100644 (file)
 #include <asm/page_types.h>
 #include <asm/init.h>
 
+#include "mm_internal.h"
+
 unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
 bool __read_mostly __vmalloc_start_set = false;
 
-static __init void *alloc_low_page(void)
-{
-       unsigned long pfn = pgt_buf_end++;
-       void *adr;
-
-       if (pfn >= pgt_buf_top)
-               panic("alloc_low_page: ran out of memory");
-
-       adr = __va(pfn * PAGE_SIZE);
-       clear_page(adr);
-       return adr;
-}
-
 /*
  * Creates a middle page table and puts a pointer to it in the
  * given global directory entry. This only returns the gd entry
@@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 
 #ifdef CONFIG_X86_PAE
        if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-               if (after_bootmem)
-                       pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
-               else
-                       pmd_table = (pmd_t *)alloc_low_page();
+               pmd_table = (pmd_t *)alloc_low_page();
                paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
                set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
                pud = pud_offset(pgd, 0);
@@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
        if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
-               pte_t *page_table = NULL;
-
-               if (after_bootmem) {
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-                       page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
-#endif
-                       if (!page_table)
-                               page_table =
-                               (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
-               } else
-                       page_table = (pte_t *)alloc_low_page();
+               pte_t *page_table = (pte_t *)alloc_low_page();
 
                paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
                set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
        return one_page_table_init(pmd) + pte_idx;
 }
 
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+       unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+       int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+       int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+       int pgd_idx, pmd_idx;
+       unsigned long vaddr;
+
+       if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+               return 0;
+
+       vaddr = start;
+       pgd_idx = pgd_index(vaddr);
+
+       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+                                                       pmd_idx++) {
+                       if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+                           (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+                               count++;
+                       vaddr += PMD_SIZE;
+               }
+               pmd_idx = 0;
+       }
+#endif
+       return count;
+}
+
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
-                                          unsigned long vaddr, pte_t *lastpte)
+                                          unsigned long vaddr, pte_t *lastpte,
+                                          void **adr)
 {
 #ifdef CONFIG_HIGHMEM
        /*
@@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 
        if (pmd_idx_kmap_begin != pmd_idx_kmap_end
            && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
-           && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-           && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
-               || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
+           && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
                pte_t *newpte;
                int i;
 
                BUG_ON(after_bootmem);
-               newpte = alloc_low_page();
+               newpte = *adr;
                for (i = 0; i < PTRS_PER_PTE; i++)
                        set_pte(newpte + i, pte[i]);
+               *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
 
                paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
                set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte = NULL;
+       unsigned long count = page_table_range_init_count(start, end);
+       void *adr = NULL;
+
+       if (count)
+               adr = alloc_low_pages(count);
 
        vaddr = start;
        pgd_idx = pgd_index(vaddr);
@@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
                for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
                                                        pmd++, pmd_idx++) {
                        pte = page_table_kmap_check(one_page_table_init(pmd),
-                                                   pmd, vaddr, pte);
+                                                   pmd, vaddr, pte, &adr);
 
                        vaddr += PMD_SIZE;
                }
@@ -310,6 +321,7 @@ repeat:
                                        __pgprot(PTE_IDENT_ATTR |
                                                 _PAGE_PSE);
 
+                               pfn &= PMD_MASK >> PAGE_SHIFT;
                                addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
                                        PAGE_OFFSET + PAGE_SIZE-1;
 
@@ -455,9 +467,14 @@ void __init native_pagetable_init(void)
 
        /*
         * Remove any mappings which extend past the end of physical
-        * memory from the boot time page table:
+        * memory from the boot time page table.
+        * In the virtual address space there are at least two pages
+        * between VMALLOC_END and pkmap or fixmap, by the definition of
+        * VMALLOC_END, and max_low_pfn is set from the VMALLOC_END
+        * physical address. If the initial memory mapping did its job,
+        * PTEs are in use near max_low_pfn, or the PMD is not present.
         */
-       for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+       for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
                va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
                pgd = base + pgd_index(va);
                if (!pgd_present(*pgd))
@@ -468,10 +485,19 @@ void __init native_pagetable_init(void)
                if (!pmd_present(*pmd))
                        break;
 
+               /* should not be large page here */
+               if (pmd_large(*pmd)) {
+                       pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+                               pfn, pmd, __pa(pmd));
+                       BUG_ON(1);
+               }
+
                pte = pte_offset_kernel(pmd, va);
                if (!pte_present(*pte))
                        break;
 
+               printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+                               pfn, pmd, __pa(pmd), pte, __pa(pte));
                pte_clear(NULL, va, pte);
        }
        paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
@@ -550,7 +576,7 @@ early_param("highmem", parse_highmem);
  * artificially via the highmem=x boot parameter then create
  * it:
  */
-void __init lowmem_pfn_init(void)
+static void __init lowmem_pfn_init(void)
 {
        /* max_low_pfn is 0, we already have early_res support */
        max_low_pfn = max_pfn;
@@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void)
  * We have more RAM than fits into lowmem - we try to put it into
  * highmem, also taking the highmem=x boot parameter into account:
  */
-void __init highmem_pfn_init(void)
+static void __init highmem_pfn_init(void)
 {
        max_low_pfn = MAXMEM_PFN;
 
@@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void)
        printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
                 max_pfn_mapped<<PAGE_SHIFT);
        printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
-
-       after_bootmem = 1;
 }
 
 /*
@@ -753,6 +777,8 @@ void __init mem_init(void)
                if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
                        reservedpages++;
 
+       after_bootmem = 1;
+
        codesize =  (unsigned long) &_etext - (unsigned long) &_text;
        datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
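
Because alloc_low_pages() may now reach into memblock, the 32-bit kmap fixup can no longer allocate PTE pages one at a time in the middle of page_table_range_init(); the new page_table_range_init_count() first counts how many replacement PTE pages the kmap window will need, a single alloc_low_pages(count) grabs them, and page_table_kmap_check() hands them out through the *adr cursor. Reduced to its two phases, the pattern is roughly the following illustrative fragment (not the kernel's exact code):

    /* Phase 1: count what the range will need, then allocate it in one go. */
    unsigned long count = page_table_range_init_count(start, end);
    void *batch = count ? alloc_low_pages(count) : NULL;

    /* Phase 2: each consumer peels one page off the batch. */
    pte_t *newpte = batch;
    batch = (void *)((unsigned long)batch + PAGE_SIZE);
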
index 287c6d6a9ef1ff1ba140801f875443e092c8c550..edaa2daf4b3729ad3da4b561786ae72964b128b8 100644 (file)
 #include <asm/uv/uv.h>
 #include <asm/setup.h>
 
+#include "mm_internal.h"
+
+static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+                          unsigned long addr, unsigned long end)
+{
+       addr &= PMD_MASK;
+       for (; addr < end; addr += PMD_SIZE) {
+               pmd_t *pmd = pmd_page + pmd_index(addr);
+
+               if (!pmd_present(*pmd))
+                       set_pmd(pmd, __pmd(addr | pmd_flag));
+       }
+}
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+                         unsigned long addr, unsigned long end)
+{
+       unsigned long next;
+
+       for (; addr < end; addr = next) {
+               pud_t *pud = pud_page + pud_index(addr);
+               pmd_t *pmd;
+
+               next = (addr & PUD_MASK) + PUD_SIZE;
+               if (next > end)
+                       next = end;
+
+               if (pud_present(*pud)) {
+                       pmd = pmd_offset(pud, 0);
+                       ident_pmd_init(info->pmd_flag, pmd, addr, next);
+                       continue;
+               }
+               pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+               if (!pmd)
+                       return -ENOMEM;
+               ident_pmd_init(info->pmd_flag, pmd, addr, next);
+               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+       }
+
+       return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+                             unsigned long addr, unsigned long end)
+{
+       unsigned long next;
+       int result;
+       int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
+
+       for (; addr < end; addr = next) {
+               pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+               pud_t *pud;
+
+               next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+               if (next > end)
+                       next = end;
+
+               if (pgd_present(*pgd)) {
+                       pud = pud_offset(pgd, 0);
+                       result = ident_pud_init(info, pud, addr, next);
+                       if (result)
+                               return result;
+                       continue;
+               }
+
+               pud = (pud_t *)info->alloc_pgt_page(info->context);
+               if (!pud)
+                       return -ENOMEM;
+               result = ident_pud_init(info, pud, addr, next);
+               if (result)
+                       return result;
+               set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+       }
+
+       return 0;
+}
+
 static int __init parse_direct_gbpages_off(char *arg)
 {
        direct_gbpages = 0;
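
kernel_ident_mapping_init() above walks PGD -> PUD -> PMD for [addr, end), reusing tables that already exist and asking the caller for new ones through info->alloc_pgt_page(); every leaf it creates is a 2M entry built from info->pmd_flag. A hedged caller sketch follows; the allocation callback, the GFP flag and the wrapper name are illustrative assumptions, only struct x86_mapping_info and kernel_ident_mapping_init() come from this patch.

/* illustrative only: feed zeroed pages to the identity-mapping walker */
static void *ident_alloc_pgt_page(void *context)
{
        return (void *)get_zeroed_page(GFP_KERNEL);
}

static int build_ident_map(pgd_t *pgd, unsigned long start, unsigned long end)
{
        struct x86_mapping_info info = {
                .alloc_pgt_page = ident_alloc_pgt_page,
                .pmd_flag       = __PAGE_KERNEL_LARGE_EXEC,
                /* .kernel_mapping left false: entries go into the identity slots */
        };

        return kernel_ident_mapping_init(&info, pgd, start, end);
}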
@@ -302,10 +378,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
 void __init cleanup_highmap(void)
 {
        unsigned long vaddr = __START_KERNEL_map;
-       unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+       unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
        unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
        pmd_t *pmd = level2_kernel_pgt;
 
+       /*
+        * On the native path, max_pfn_mapped is not set yet.
+        * Xen has a valid max_pfn_mapped set in
+        *      arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
+        */
+       if (max_pfn_mapped)
+               vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+
        for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
                if (pmd_none(*pmd))
                        continue;
@@ -314,69 +398,24 @@ void __init cleanup_highmap(void)
        }
 }
 
-static __ref void *alloc_low_page(unsigned long *phys)
-{
-       unsigned long pfn = pgt_buf_end++;
-       void *adr;
-
-       if (after_bootmem) {
-               adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
-               *phys = __pa(adr);
-
-               return adr;
-       }
-
-       if (pfn >= pgt_buf_top)
-               panic("alloc_low_page: ran out of memory");
-
-       adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
-       clear_page(adr);
-       *phys  = pfn * PAGE_SIZE;
-       return adr;
-}
-
-static __ref void *map_low_page(void *virt)
-{
-       void *adr;
-       unsigned long phys, left;
-
-       if (after_bootmem)
-               return virt;
-
-       phys = __pa(virt);
-       left = phys & (PAGE_SIZE - 1);
-       adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
-       adr = (void *)(((unsigned long)adr) | left);
-
-       return adr;
-}
-
-static __ref void unmap_low_page(void *adr)
-{
-       if (after_bootmem)
-               return;
-
-       early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
-}
-
 static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
              pgprot_t prot)
 {
-       unsigned pages = 0;
+       unsigned long pages = 0, next;
        unsigned long last_map_addr = end;
        int i;
 
        pte_t *pte = pte_page + pte_index(addr);
 
-       for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
-
+       for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+               next = (addr & PAGE_MASK) + PAGE_SIZE;
                if (addr >= end) {
-                       if (!after_bootmem) {
-                               for(; i < PTRS_PER_PTE; i++, pte++)
-                                       set_pte(pte, __pte(0));
-                       }
-                       break;
+                       if (!after_bootmem &&
+                           !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+                           !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+                               set_pte(pte, __pte(0));
+                       continue;
                }
 
                /*
@@ -414,28 +453,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
        int i = pmd_index(address);
 
        for (; i < PTRS_PER_PMD; i++, address = next) {
-               unsigned long pte_phys;
                pmd_t *pmd = pmd_page + pmd_index(address);
                pte_t *pte;
                pgprot_t new_prot = prot;
 
+               next = (address & PMD_MASK) + PMD_SIZE;
                if (address >= end) {
-                       if (!after_bootmem) {
-                               for (; i < PTRS_PER_PMD; i++, pmd++)
-                                       set_pmd(pmd, __pmd(0));
-                       }
-                       break;
+                       if (!after_bootmem &&
+                           !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+                           !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+                               set_pmd(pmd, __pmd(0));
+                       continue;
                }
 
-               next = (address & PMD_MASK) + PMD_SIZE;
-
                if (pmd_val(*pmd)) {
                        if (!pmd_large(*pmd)) {
                                spin_lock(&init_mm.page_table_lock);
-                               pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+                               pte = (pte_t *)pmd_page_vaddr(*pmd);
                                last_map_addr = phys_pte_init(pte, address,
                                                                end, prot);
-                               unmap_low_page(pte);
                                spin_unlock(&init_mm.page_table_lock);
                                continue;
                        }
@@ -464,19 +500,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
                        pages++;
                        spin_lock(&init_mm.page_table_lock);
                        set_pte((pte_t *)pmd,
-                               pfn_pte(address >> PAGE_SHIFT,
+                               pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
                                        __pgprot(pgprot_val(prot) | _PAGE_PSE)));
                        spin_unlock(&init_mm.page_table_lock);
                        last_map_addr = next;
                        continue;
                }
 
-               pte = alloc_low_page(&pte_phys);
+               pte = alloc_low_page();
                last_map_addr = phys_pte_init(pte, address, end, new_prot);
-               unmap_low_page(pte);
 
                spin_lock(&init_mm.page_table_lock);
-               pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+               pmd_populate_kernel(&init_mm, pmd, pte);
                spin_unlock(&init_mm.page_table_lock);
        }
        update_page_count(PG_LEVEL_2M, pages);
@@ -492,27 +527,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
        int i = pud_index(addr);
 
        for (; i < PTRS_PER_PUD; i++, addr = next) {
-               unsigned long pmd_phys;
                pud_t *pud = pud_page + pud_index(addr);
                pmd_t *pmd;
                pgprot_t prot = PAGE_KERNEL;
 
-               if (addr >= end)
-                       break;
-
                next = (addr & PUD_MASK) + PUD_SIZE;
-
-               if (!after_bootmem && !e820_any_mapped(addr, next, 0)) {
-                       set_pud(pud, __pud(0));
+               if (addr >= end) {
+                       if (!after_bootmem &&
+                           !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+                           !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+                               set_pud(pud, __pud(0));
                        continue;
                }
 
                if (pud_val(*pud)) {
                        if (!pud_large(*pud)) {
-                               pmd = map_low_page(pmd_offset(pud, 0));
+                               pmd = pmd_offset(pud, 0);
                                last_map_addr = phys_pmd_init(pmd, addr, end,
                                                         page_size_mask, prot);
-                               unmap_low_page(pmd);
                                __flush_tlb_all();
                                continue;
                        }
@@ -541,19 +573,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
                        pages++;
                        spin_lock(&init_mm.page_table_lock);
                        set_pte((pte_t *)pud,
-                               pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+                               pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+                                       PAGE_KERNEL_LARGE));
                        spin_unlock(&init_mm.page_table_lock);
                        last_map_addr = next;
                        continue;
                }
 
-               pmd = alloc_low_page(&pmd_phys);
+               pmd = alloc_low_page();
                last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
                                              prot);
-               unmap_low_page(pmd);
 
                spin_lock(&init_mm.page_table_lock);
-               pud_populate(&init_mm, pud, __va(pmd_phys));
+               pud_populate(&init_mm, pud, pmd);
                spin_unlock(&init_mm.page_table_lock);
        }
        __flush_tlb_all();
@@ -578,28 +610,23 @@ kernel_physical_mapping_init(unsigned long start,
 
        for (; start < end; start = next) {
                pgd_t *pgd = pgd_offset_k(start);
-               unsigned long pud_phys;
                pud_t *pud;
 
-               next = (start + PGDIR_SIZE) & PGDIR_MASK;
-               if (next > end)
-                       next = end;
+               next = (start & PGDIR_MASK) + PGDIR_SIZE;
 
                if (pgd_val(*pgd)) {
-                       pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+                       pud = (pud_t *)pgd_page_vaddr(*pgd);
                        last_map_addr = phys_pud_init(pud, __pa(start),
                                                 __pa(end), page_size_mask);
-                       unmap_low_page(pud);
                        continue;
                }
 
-               pud = alloc_low_page(&pud_phys);
-               last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+               pud = alloc_low_page();
+               last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
                                                 page_size_mask);
-               unmap_low_page(pud);
 
                spin_lock(&init_mm.page_table_lock);
-               pgd_populate(&init_mm, pgd, __va(pud_phys));
+               pgd_populate(&init_mm, pgd, pud);
                spin_unlock(&init_mm.page_table_lock);
                pgd_changed = true;
        }
@@ -664,13 +691,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
 {
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
-       unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
+       unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;
 
-       last_mapped_pfn = init_memory_mapping(start, start + size);
-       if (last_mapped_pfn > max_pfn_mapped)
-               max_pfn_mapped = last_mapped_pfn;
+       init_memory_mapping(start, start + size);
 
        ret = __add_pages(nid, zone, start_pfn, nr_pages);
        WARN_ON_ONCE(ret);
@@ -686,6 +711,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory);
 
 static struct kcore_list kcore_vsyscall;
 
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NUMA
+       int i;
+
+       for_each_online_node(i)
+               register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+
 void __init mem_init(void)
 {
        long codesize, reservedpages, datasize, initsize;
@@ -698,11 +733,8 @@ void __init mem_init(void)
        reservedpages = 0;
 
        /* this will put all low memory onto the freelists */
-#ifdef CONFIG_NUMA
-       totalram_pages = numa_free_all_bootmem();
-#else
+       register_page_bootmem_info();
        totalram_pages = free_all_bootmem();
-#endif
 
        absent_pages = absent_pages_in_range(0, max_pfn);
        reservedpages = max_pfn - totalram_pages - absent_pages;
@@ -776,6 +808,7 @@ void mark_rodata_ro(void)
        unsigned long end = (unsigned long) &__end_rodata_hpage_align;
        unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
        unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+       unsigned long all_end = PFN_ALIGN(&_end);
 
        printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
               (end - start) >> 10);
@@ -784,10 +817,10 @@ void mark_rodata_ro(void)
        kernel_set_to_readonly = 1;
 
        /*
-        * The rodata section (but not the kernel text!) should also be
-        * not-executable.
+        * The rodata/data/bss/brk section (but not the kernel text!)
+        * should also be not-executable.
         */
-       set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+       set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
 
        rodata_test();
 
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644 (file)
index 0000000..6b563a1
--- /dev/null
@@ -0,0 +1,19 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+       return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+                                            unsigned long end,
+                                            unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+#endif /* __X86_MM_INTERNAL_H */
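
mm_internal.h is the new private interface shared by the x86 mm code: page-table pages now come from alloc_low_pages()/alloc_low_page() and are already part of the direct mapping, which is why the map_low_page()/unmap_low_page() round trips disappear from init_64.c above. The body lives in arch/x86/mm/init.c and is not part of this diff, so the following is only a rough, assumption-laden model of the behaviour the callers rely on, not the real implementation.

/* sketch: zeroed, already-mapped pages; memblock before after_bootmem,
 * the page allocator afterwards */
void *alloc_low_pages(unsigned int num)
{
        unsigned long size = num << PAGE_SHIFT;
        phys_addr_t phys;

        if (after_bootmem)
                return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
                                                get_order(size));

        /* early path: carve zeroed, already-mapped pages out of memblock */
        phys = memblock_find_in_range(0, max_pfn_mapped << PAGE_SHIFT,
                                      size, PAGE_SIZE);
        if (!phys)
                panic("alloc_low_pages: out of early page-table memory");
        memblock_reserve(phys, size);
        memset(__va(phys), 0, size);
        return __va(phys);
}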
index 92e27119ee1a0b5db2559ff8787a3ce83c150a59..9405ffc915026a14e5658d82ab9a6c54c9a781b2 100644 (file)
@@ -10,16 +10,3 @@ void __init initmem_init(void)
 {
        x86_numa_init();
 }
-
-unsigned long __init numa_free_all_bootmem(void)
-{
-       unsigned long pages = 0;
-       int i;
-
-       for_each_online_node(i)
-               pages += free_all_bootmem_node(NODE_DATA(i));
-
-       pages += free_low_memory_core_early(MAX_NUMNODES);
-
-       return pages;
-}
index 6d13d2a3f825306a11195a258fb7f78ed073c887..a1b1c88f9caf5f6c1eaf6a8b6a95477dad36a5f8 100644 (file)
@@ -579,16 +579,10 @@ static int split_large_page(pte_t *kpte, unsigned long address)
        for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
                set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
 
-       if (address >= (unsigned long)__va(0) &&
-               address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+       if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
+                               PFN_DOWN(__pa(address)) + 1))
                split_page_count(level);
 
-#ifdef CONFIG_X86_64
-       if (address >= (unsigned long)__va(1UL<<32) &&
-               address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
-               split_page_count(level);
-#endif
-
        /*
         * Install the new, split up pagetable.
         *
@@ -757,13 +751,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
        unsigned long vaddr;
        int ret;
 
-       if (cpa->pfn >= max_pfn_mapped)
+       if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
                return 0;
 
-#ifdef CONFIG_X86_64
-       if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
-               return 0;
-#endif
        /*
         * No need to redo, when the primary call touched the direct
         * mapping already:
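
Both pageattr.c hunks replace open-coded max_low_pfn_mapped / max_pfn_mapped comparisons with pfn_range_is_mapped(), which asks whether a pfn range lies inside one of the ranges the kernel actually direct-mapped (the same pfn_mapped[]/nr_pfn_mapped table the hibernation code below iterates). A small self-contained model of that check; the table contents and the exact containment rule are illustrative, not copied from the kernel source.

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long start, end; };     /* [start, end) in pfns */

/* model of the kernel's pfn_mapped[] table (values are made up) */
static struct range pfn_mapped[] = { { 0, 0x100 }, { 0x100000, 0x140000 } };
static int nr_pfn_mapped = 2;

static bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
        for (int i = 0; i < nr_pfn_mapped; i++)
                if (start_pfn >= pfn_mapped[i].start &&
                    end_pfn   <= pfn_mapped[i].end)
                        return true;
        return false;
}

int main(void)
{
        printf("%d\n", pfn_range_is_mapped(0x10, 0x11));   /* 1: inside low range */
        printf("%d\n", pfn_range_is_mapped(0x200, 0x201)); /* 0: falls in a hole  */
        return 0;
}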
index 1b600266265e4892eb06004642caf3e737699396..1743c1c924119bb1d8d4d8824ad5dada74dc166e 100644 (file)
@@ -835,7 +835,7 @@ void __init efi_enter_virtual_mode(void)
        efi_memory_desc_t *md, *prev_md = NULL;
        efi_status_t status;
        unsigned long size;
-       u64 end, systab, end_pfn;
+       u64 end, systab, start_pfn, end_pfn;
        void *p, *va, *new_memmap = NULL;
        int count = 0;
 
@@ -888,10 +888,9 @@ void __init efi_enter_virtual_mode(void)
                size = md->num_pages << EFI_PAGE_SHIFT;
                end = md->phys_addr + size;
 
+               start_pfn = PFN_DOWN(md->phys_addr);
                end_pfn = PFN_UP(end);
-               if (end_pfn <= max_low_pfn_mapped
-                   || (end_pfn > (1UL << (32 - PAGE_SHIFT))
-                       && end_pfn <= max_pfn_mapped)) {
+               if (pfn_range_is_mapped(start_pfn, end_pfn)) {
                        va = __va(md->phys_addr);
 
                        if (!(md->attribute & EFI_MEMORY_WB))
index 460f314d13e54fff4be0c3cd3d9a126a2973d0c7..a0fde91c16cf779203bcb0195c6f063be9f303b3 100644 (file)
@@ -11,6 +11,8 @@
 #include <linux/gfp.h>
 #include <linux/smp.h>
 #include <linux/suspend.h>
+
+#include <asm/init.h>
 #include <asm/proto.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt;
 
 void *relocated_restore_code;
 
-static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void *alloc_pgt_page(void *context)
 {
-       long i, j;
-
-       i = pud_index(address);
-       pud = pud + i;
-       for (; i < PTRS_PER_PUD; pud++, i++) {
-               unsigned long paddr;
-               pmd_t *pmd;
-
-               paddr = address + i*PUD_SIZE;
-               if (paddr >= end)
-                       break;
-
-               pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
-               if (!pmd)
-                       return -ENOMEM;
-               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
-               for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-                       unsigned long pe;
-
-                       if (paddr >= end)
-                               break;
-                       pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
-                       pe &= __supported_pte_mask;
-                       set_pmd(pmd, __pmd(pe));
-               }
-       }
-       return 0;
+       return (void *)get_safe_page(GFP_ATOMIC);
 }
 
 static int set_up_temporary_mappings(void)
 {
-       unsigned long start, end, next;
-       int error;
+       struct x86_mapping_info info = {
+               .alloc_pgt_page = alloc_pgt_page,
+               .pmd_flag       = __PAGE_KERNEL_LARGE_EXEC,
+               .kernel_mapping = true,
+       };
+       unsigned long mstart, mend;
+       int result;
+       int i;
 
        temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
        if (!temp_level4_pgt)
@@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void)
                init_level4_pgt[pgd_index(__START_KERNEL_map)]);
 
        /* Set up the direct mapping from scratch */
-       start = (unsigned long)pfn_to_kaddr(0);
-       end = (unsigned long)pfn_to_kaddr(max_pfn);
-
-       for (; start < end; start = next) {
-               pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
-               if (!pud)
-                       return -ENOMEM;
-               next = start + PGDIR_SIZE;
-               if (next > end)
-                       next = end;
-               if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
-                       return error;
-               set_pgd(temp_level4_pgt + pgd_index(start),
-                       mk_kernel_pgd(__pa(pud)));
+       for (i = 0; i < nr_pfn_mapped; i++) {
+               mstart = pfn_mapped[i].start << PAGE_SHIFT;
+               mend   = pfn_mapped[i].end << PAGE_SHIFT;
+
+               result = kernel_ident_mapping_init(&info, temp_level4_pgt,
+                                                  mstart, mend);
+
+               if (result)
+                       return result;
        }
+
        return 0;
 }
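
With kernel_ident_mapping_init() doing the heavy lifting, set_up_temporary_mappings() shrinks to a loop over the already-mapped pfn ranges plus a get_safe_page()-backed allocation callback. From the fields used here and in init_64.c, the shared descriptor in asm/init.h presumably looks roughly like the sketch below; the real definition is not part of this diff, so field order, types and comments are guesses.

struct x86_mapping_info {
        void *(*alloc_pgt_page)(void *); /* hand back one usable page-table page */
        void *context;                   /* opaque cookie passed to the callback */
        unsigned long pmd_flag;          /* flags ORed into every 2M leaf entry  */
        bool kernel_mapping;             /* true: place entries at the __PAGE_OFFSET
                                            pgd slots instead of the identity slots */
};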
 
index 80450261215c50b8c62b3fd559b0dd7a82f6275d..a44f457e70a19f7ffce772c5497e0090ee514db6 100644 (file)
@@ -8,9 +8,26 @@
 struct real_mode_header *real_mode_header;
 u32 *trampoline_cr4_features;
 
-void __init setup_real_mode(void)
+void __init reserve_real_mode(void)
 {
        phys_addr_t mem;
+       unsigned char *base;
+       size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+
+       /* Has to be under 1M so we can execute real-mode AP code. */
+       mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+       if (!mem)
+               panic("Cannot allocate trampoline\n");
+
+       base = __va(mem);
+       memblock_reserve(mem, size);
+       real_mode_header = (struct real_mode_header *) base;
+       printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+              base, (unsigned long long)mem, size);
+}
+
+void __init setup_real_mode(void)
+{
        u16 real_mode_seg;
        u32 *rel;
        u32 count;
@@ -25,16 +42,7 @@ void __init setup_real_mode(void)
        u64 efer;
 #endif
 
-       /* Has to be in very low memory so we can execute real-mode AP code. */
-       mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
-       if (!mem)
-               panic("Cannot allocate trampoline\n");
-
-       base = __va(mem);
-       memblock_reserve(mem, size);
-       real_mode_header = (struct real_mode_header *) base;
-       printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
-              base, (unsigned long long)mem, size);
+       base = (unsigned char *)real_mode_header;
 
        memcpy(base, real_mode_blob, size);
 
@@ -78,16 +86,18 @@ void __init setup_real_mode(void)
        *trampoline_cr4_features = read_cr4();
 
        trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
-       trampoline_pgd[0] = __pa_symbol(level3_ident_pgt) + _KERNPG_TABLE;
-       trampoline_pgd[511] = __pa_symbol(level3_kernel_pgt) + _KERNPG_TABLE;
+       trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
+       trampoline_pgd[511] = init_level4_pgt[511].pgd;
 #endif
 }
 
 /*
- * set_real_mode_permissions() gets called very early, to guarantee the
- * availability of low memory.  This is before the proper kernel page
+ * reserve_real_mode() gets called very early, to guarantee the
+ * availability of low memory. This is before the proper kernel page
  * tables are set up, so we cannot set page permissions in that
- * function.  Thus, we use an arch_initcall instead.
+ * function. Also, the trampoline code will be executed by the APs,
+ * so it needs to be marked executable by do_pre_smp_initcalls() at
+ * the latest; thus we run this as an early_initcall().
  */
 static int __init set_real_mode_permissions(void)
 {
@@ -111,5 +121,4 @@ static int __init set_real_mode_permissions(void)
 
        return 0;
 }
-
-arch_initcall(set_real_mode_permissions);
+early_initcall(set_real_mode_permissions);
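
The real-mode blob handling is now split in two: reserve_real_mode() only memblock-reserves the under-1M region very early, setup_real_mode() copies and relocates the blob once the kernel page tables exist, and the permission fixup moves from arch_initcall to early_initcall so the AP trampoline is already executable when do_pre_smp_initcalls() brings up the secondary CPUs. The ordering this implies is sketched below; the placement inside setup_arch() is inferred, not shown in this diff.

/* sketch of the boot ordering implied by the split (call sites assumed) */
void __init setup_arch(char **cmdline_p)
{
        /* ... */
        reserve_real_mode();    /* grab <1M memory, nothing is copied yet        */
        /* ... direct mappings and kernel page tables are built here ...         */
        setup_real_mode();      /* copy the blob, patch relocations, build the
                                   trampoline page tables                        */
        /* ... */
}

/* runs from do_pre_smp_initcalls(), i.e. before any AP is started */
early_initcall(set_real_mode_permissions);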
index 5a1847d619306e5f0ed5ed5570c53c69a8ce2315..79d67bd507fa6c5bd779defa312943b1d99a47f6 100644 (file)
@@ -814,12 +814,14 @@ int main(int argc, char **argv)
        read_relocs(fp);
        if (show_absolute_syms) {
                print_absolute_symbols();
-               return 0;
+               goto out;
        }
        if (show_absolute_relocs) {
                print_absolute_relocs();
-               return 0;
+               goto out;
        }
        emit_relocs(as_text, use_real_mode);
+out:
+       fclose(fp);
        return 0;
 }
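
The relocs.c hunk turns the two early returns into goto out so the ELF file opened earlier in main() is always fclose()d; previously the print_absolute_symbols() and print_absolute_relocs() paths returned without closing it. The same single-exit cleanup idiom, as a tiny standalone program:

#include <stdio.h>

/* count '\n' bytes; bail out early (but still clean up) on a '#' header */
int count_lines(const char *path)
{
        int lines = 0, c;
        FILE *fp = fopen(path, "r");

        if (!fp)
                return -1;

        c = fgetc(fp);
        if (c == '#') {
                lines = 0;
                goto out;               /* early exit still releases fp */
        }

        while (c != EOF) {
                if (c == '\n')
                        lines++;
                c = fgetc(fp);
        }
out:
        fclose(fp);
        return lines;
}

int main(void)
{
        printf("%d\n", count_lines("/etc/hostname"));
        return 0;
}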
index 01de35c772210120075300504189c22bd00c5899..f5e86eee4e0ec9c7b80c94433f4ace0be0fe7403 100644 (file)
@@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm)
 
 static void xen_post_allocator_init(void);
 
-static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
-{
-       /* reserve the range used */
-       native_pagetable_reserve(start, end);
-
-       /* set as RW the rest */
-       printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
-                       PFN_PHYS(pgt_buf_top));
-       while (end < PFN_PHYS(pgt_buf_top)) {
-               make_lowmem_page_readwrite(__va(end));
-               end += PAGE_SIZE;
-       }
-}
-
 #ifdef CONFIG_X86_64
 static void __init xen_cleanhighmap(unsigned long vaddr,
                                    unsigned long vaddr_end)
@@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 #else /* CONFIG_X86_64 */
 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 {
-       unsigned long pfn = pte_pfn(pte);
-
-       /*
-        * If the new pfn is within the range of the newly allocated
-        * kernel pagetable, and it isn't being mapped into an
-        * early_ioremap fixmap slot as a freshly allocated page, make sure
-        * it is RO.
-        */
-       if (((!is_early_ioremap_ptep(ptep) &&
-                       pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
-                       (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
-               pte = pte_wrprotect(pte);
-
        return pte;
 }
 #endif /* CONFIG_X86_64 */
@@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 
 void __init xen_init_mmu_ops(void)
 {
-       x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
        x86_init.paging.pagetable_init = xen_pagetable_init;
        pv_mmu_ops = xen_mmu_ops;
 
index af47e7594460a98afad10a5bb89e376fff509e7f..1d94316f0ea46616ceda930896af210d7ad68284 100644 (file)
@@ -231,7 +231,9 @@ retry:
        }
        start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
        if (early) {
-               swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose);
+               if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs,
+                        verbose))
+                       panic("Cannot allocate SWIOTLB buffer");
                rc = 0;
        } else
                rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs);
index 3f778c27f8259aa91452fa85895890bcc7007f4f..3cd16ba82f15119701ebd477320304d714aba971 100644 (file)
@@ -99,6 +99,9 @@ void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 extern void *__alloc_bootmem_low(unsigned long size,
                                 unsigned long align,
                                 unsigned long goal);
+void *__alloc_bootmem_low_nopanic(unsigned long size,
+                                unsigned long align,
+                                unsigned long goal);
 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
                                      unsigned long size,
                                      unsigned long align,
@@ -132,6 +135,8 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 
 #define alloc_bootmem_low(x) \
        __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_low_pages_nopanic(x) \
+       __alloc_bootmem_low_nopanic(x, PAGE_SIZE, 0)
 #define alloc_bootmem_low_pages(x) \
        __alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_low_pages_node(pgdat, x) \
index d0b8458a703a25503bcfe75f39ef804f36636aea..d2e6927bbaae1eb65b5b1dd2575643a5562e329a 100644 (file)
@@ -191,6 +191,7 @@ extern struct kimage *kexec_crash_image;
 /* Location of a reserved region to hold the crash kernel.
  */
 extern struct resource crashk_res;
+extern struct resource crashk_low_res;
 typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
 extern note_buf_t __percpu *crash_notes;
 extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
@@ -199,6 +200,8 @@ extern size_t vmcoreinfo_max_size;
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
                unsigned long long *crash_size, unsigned long long *crash_base);
+int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
+               unsigned long long *crash_size, unsigned long long *crash_base);
 int crash_shrink_memory(unsigned long new_size);
 size_t crash_get_memory_size(void);
 void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
index d452ee191066456278080c92322e093986baa1fb..f388203db7e85b421bfb58d3963f2edbc315c70c 100644 (file)
@@ -155,6 +155,7 @@ phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
                                  phys_addr_t max_addr);
 phys_addr_t memblock_phys_mem_size(void);
+phys_addr_t memblock_mem_size(unsigned long limit_pfn);
 phys_addr_t memblock_start_of_DRAM(void);
 phys_addr_t memblock_end_of_DRAM(void);
 void memblock_enforce_memory_limit(phys_addr_t memory_limit);
index 66e2f7c61e5c9d3a2924389e28ffa9d32c8728bf..9d9dcc35d6a1b00208cd3842a4791c192a0c4118 100644 (file)
@@ -1386,7 +1386,6 @@ extern void __init mmap_init(void);
 extern void show_mem(unsigned int flags);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
-extern int after_bootmem;
 
 extern __printf(3, 4)
 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
index 071d62c214a6e0b0cb91e3022513c45169d06a6b..2de42f9401d2599d4309028297c29a3bff9222c8 100644 (file)
@@ -23,7 +23,7 @@ extern int swiotlb_force;
 #define IO_TLB_SHIFT 11
 
 extern void swiotlb_init(int verbose);
-extern void swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
+int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 extern unsigned long swiotlb_nr_tbl(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
 
index 5e4bd7864c5dedf836a7c85cc1ed8e3c4e31e6cb..2436ffcec91f0de58543e259c3dac620a26da36b 100644 (file)
@@ -54,6 +54,12 @@ struct resource crashk_res = {
        .end   = 0,
        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
+struct resource crashk_low_res = {
+       .name  = "Crash kernel low",
+       .start = 0,
+       .end   = 0,
+       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
 
 int kexec_should_crash(struct task_struct *p)
 {
@@ -1369,10 +1375,11 @@ static int __init parse_crashkernel_simple(char                 *cmdline,
  * That function is the entry point for command line parsing and should be
  * called from the arch-specific code.
  */
-int __init parse_crashkernel(char               *cmdline,
+static int __init __parse_crashkernel(char *cmdline,
                             unsigned long long system_ram,
                             unsigned long long *crash_size,
-                            unsigned long long *crash_base)
+                            unsigned long long *crash_base,
+                               const char *name)
 {
        char    *p = cmdline, *ck_cmdline = NULL;
        char    *first_colon, *first_space;
@@ -1382,16 +1389,16 @@ int __init parse_crashkernel(char                *cmdline,
        *crash_base = 0;
 
        /* find crashkernel and use the last one if there are more */
-       p = strstr(p, "crashkernel=");
+       p = strstr(p, name);
        while (p) {
                ck_cmdline = p;
-               p = strstr(p+1, "crashkernel=");
+               p = strstr(p+1, name);
        }
 
        if (!ck_cmdline)
                return -EINVAL;
 
-       ck_cmdline += 12; /* strlen("crashkernel=") */
+       ck_cmdline += strlen(name);
 
        /*
         * if the commandline contains a ':', then that's the extended
@@ -1409,6 +1416,23 @@ int __init parse_crashkernel(char                 *cmdline,
        return 0;
 }
 
+int __init parse_crashkernel(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                                       "crashkernel=");
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                                       "crashkernel_low=");
+}
 
 static void update_vmcoreinfo_note(void)
 {
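
The kexec.c refactor makes the parser generic over the option name, so parse_crashkernel() and the new parse_crashkernel_low() become thin wrappers around __parse_crashkernel() with "crashkernel=" and "crashkernel_low=" respectively; note that the scan deliberately keeps the last occurrence on the command line. A standalone model of that last-match lookup; only the option names are taken from the patch, the rest is illustrative.

#include <stdio.h>
#include <string.h>

/* return a pointer just past the last occurrence of "name", or NULL */
static const char *find_last_option(const char *cmdline, const char *name)
{
        const char *p = cmdline, *hit = NULL;

        p = strstr(p, name);
        while (p) {
                hit = p;
                p = strstr(p + 1, name);
        }
        return hit ? hit + strlen(name) : NULL;
}

int main(void)
{
        const char *cmdline =
                "root=/dev/sda1 crashkernel=64M crashkernel=128M crashkernel_low=72M";

        /* the kernel would go on to parse size[KMG] from these positions */
        printf("crashkernel -> %s\n", find_last_option(cmdline, "crashkernel="));
        printf("crashkernel_low -> %s\n", find_last_option(cmdline, "crashkernel_low="));
        return 0;
}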
index 196b06984decbc0263b74eda9f34e94f425eb070..bfe02b8fc55b3d1c383832148e4e5aa44e0ef87c 100644 (file)
@@ -122,11 +122,18 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
        return phys_to_dma(hwdev, virt_to_phys(address));
 }
 
+static bool no_iotlb_memory;
+
 void swiotlb_print_info(void)
 {
        unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
        unsigned char *vstart, *vend;
 
+       if (no_iotlb_memory) {
+               pr_warn("software IO TLB: No low mem\n");
+               return;
+       }
+
        vstart = phys_to_virt(io_tlb_start);
        vend = phys_to_virt(io_tlb_end);
 
@@ -136,7 +143,7 @@ void swiotlb_print_info(void)
               bytes >> 20, vstart, vend - 1);
 }
 
-void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
        void *v_overflow_buffer;
        unsigned long i, bytes;
@@ -150,9 +157,10 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
        /*
         * Get the overflow emergency buffer
         */
-       v_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow));
+       v_overflow_buffer = alloc_bootmem_low_pages_nopanic(
+                                               PAGE_ALIGN(io_tlb_overflow));
        if (!v_overflow_buffer)
-               panic("Cannot allocate SWIOTLB overflow buffer!\n");
+               return -ENOMEM;
 
        io_tlb_overflow_buffer = __pa(v_overflow_buffer);
 
@@ -169,15 +177,19 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 
        if (verbose)
                swiotlb_print_info();
+
+       return 0;
 }
 
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
  */
-static void __init
-swiotlb_init_with_default_size(size_t default_size, int verbose)
+void  __init
+swiotlb_init(int verbose)
 {
+       /* default to 64MB */
+       size_t default_size = 64UL<<20;
        unsigned char *vstart;
        unsigned long bytes;
 
@@ -188,20 +200,16 @@ swiotlb_init_with_default_size(size_t default_size, int verbose)
 
        bytes = io_tlb_nslabs << IO_TLB_SHIFT;
 
-       /*
-        * Get IO TLB memory from the low pages
-        */
-       vstart = alloc_bootmem_low_pages(PAGE_ALIGN(bytes));
-       if (!vstart)
-               panic("Cannot allocate SWIOTLB buffer");
-
-       swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose);
-}
+       /* Get IO TLB memory from the low pages */
+       vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes));
+       if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
+               return;
 
-void __init
-swiotlb_init(int verbose)
-{
-       swiotlb_init_with_default_size(64 * (1<<20), verbose);  /* default to 64MB */
+       if (io_tlb_start)
+               free_bootmem(io_tlb_start,
+                                PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+       pr_warn("Cannot allocate SWIOTLB buffer");
+       no_iotlb_memory = true;
 }
 
 /*
@@ -405,6 +413,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
        unsigned long offset_slots;
        unsigned long max_slots;
 
+       if (no_iotlb_memory)
+               panic("Cannot allocate SWIOTLB buffer earlier and cannot now provide the DMA bounce buffer");
+
        mask = dma_get_seg_boundary(hwdev);
 
        tbl_dma_addr &= mask;
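
The swiotlb changes make early initialization non-fatal: if the low bounce buffer or its overflow buffer cannot be allocated, swiotlb_init() frees whatever it got, warns, and records no_iotlb_memory; the panic only happens later, in swiotlb_tbl_map_single(), if a device actually needs bouncing. A small standalone sketch of that fail-late pattern; names and sizes are illustrative.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static void *bounce_buf;
static bool no_bounce_memory;

static void bounce_init(size_t bytes)
{
        bounce_buf = malloc(bytes);
        if (!bounce_buf) {
                fprintf(stderr, "bounce: no low mem, continuing without it\n");
                no_bounce_memory = true;        /* defer the failure */
        }
}

static void *bounce_map(void)
{
        if (no_bounce_memory) {
                fprintf(stderr, "bounce: needed but never allocated\n");
                abort();                        /* the kernel panics here instead */
        }
        return bounce_buf;
}

int main(void)
{
        bounce_init(1 << 20);
        printf("mapped at %p\n", bounce_map());
        free(bounce_buf);
        return 0;
}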
index b93376c39b61308fe2ef7de2466e83306c865cf9..2b0bcb019ec222b8d56be811866e421a6287f13b 100644 (file)
@@ -833,6 +833,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
        return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
 }
 
+void * __init __alloc_bootmem_low_nopanic(unsigned long size,
+                                         unsigned long align,
+                                         unsigned long goal)
+{
+       return ___alloc_bootmem_nopanic(size, align, goal,
+                                       ARCH_LOW_ADDRESS_LIMIT);
+}
+
 /**
  * __alloc_bootmem_low_node - allocate low boot memory from a specific node
  * @pgdat: node to allocate from
index 88adc8afb6103d6c1ba2eb7e20c1ed226cf2b1d3..b8d9147e5c084de3264fc49ea93c95e22eddf49d 100644 (file)
@@ -828,6 +828,23 @@ phys_addr_t __init memblock_phys_mem_size(void)
        return memblock.memory.total_size;
 }
 
+phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
+{
+       unsigned long pages = 0;
+       struct memblock_region *r;
+       unsigned long start_pfn, end_pfn;
+
+       for_each_memblock(memory, r) {
+               start_pfn = memblock_region_memory_base_pfn(r);
+               end_pfn = memblock_region_memory_end_pfn(r);
+               start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
+               end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
+               pages += end_pfn - start_pfn;
+       }
+
+       return (phys_addr_t)pages << PAGE_SHIFT;
+}
+
 /* lowest address */
 phys_addr_t __init_memblock memblock_start_of_DRAM(void)
 {
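
memblock_mem_size() returns how much memory lies below limit_pfn by clamping every region's pfn range to the limit and summing, which is exactly what a caller needs to learn, for example, how much RAM sits under 4G when sizing the new crashkernel_low= reservation. The clamping arithmetic, as a standalone model with made-up regions:

#include <stdio.h>

struct region { unsigned long start_pfn, end_pfn; };    /* [start, end) */

static unsigned long mem_pages_below(const struct region *r, int n,
                                     unsigned long limit_pfn)
{
        unsigned long pages = 0;

        for (int i = 0; i < n; i++) {
                unsigned long s = r[i].start_pfn < limit_pfn ? r[i].start_pfn : limit_pfn;
                unsigned long e = r[i].end_pfn   < limit_pfn ? r[i].end_pfn   : limit_pfn;
                pages += e - s;
        }
        return pages;
}

int main(void)
{
        struct region mem[] = { { 0x10, 0x9f }, { 0x100, 0x100000 }, { 0x100000, 0x180000 } };
        unsigned long limit = 0x100000;          /* 4G with 4K pages */

        /* pages below the limit: (0x9f-0x10) + (0x100000-0x100) + 0 */
        printf("%lu pages below limit\n", mem_pages_below(mem, 3, limit));
        return 0;
}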
index b8294fc03df869153378f47f41f0ecd595c10887..5e07d36e381e6a568bd4770ce4c5079d1547ee56 100644 (file)
@@ -153,21 +153,6 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
                        z->managed_pages = 0;
 }
 
-/**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
-{
-       register_page_bootmem_info_node(pgdat);
-       reset_node_lowmem_managed_pages(pgdat);
-
-       /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
-       return 0;
-}
-
 /**
  * free_all_bootmem - release free pages to the buddy allocator
  *
@@ -406,6 +391,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
        return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
 }
 
+void * __init __alloc_bootmem_low_nopanic(unsigned long size,
+                                         unsigned long align,
+                                         unsigned long goal)
+{
+       return ___alloc_bootmem_nopanic(size, align, goal,
+                                       ARCH_LOW_ADDRESS_LIMIT);
+}
+
 /**
  * __alloc_bootmem_low_node - allocate low boot memory from a specific node
  * @pgdat: node to allocate from