Merge tag 'v2.6.37' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a2043015c1cce3eb49a46c0c435e9297..7d90ceb882a41ec55f0d8aea7b2c40cd806afd81 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
 #include <linux/kprobes.h>             /* __kprobes, ...               */
 #include <linux/mmiotrace.h>           /* kmmio_handler, ...           */
 #include <linux/perf_event.h>          /* perf_sw_event                */
+#include <linux/hugetlb.h>             /* hstate_index_to_shift        */
 
 #include <asm/traps.h>                 /* dotraplinkage, ...           */
 #include <asm/pgalloc.h>               /* pgd_*(), ...                 */
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-                    struct task_struct *tsk)
+                    struct task_struct *tsk, int fault)
 {
+       unsigned lsb = 0;
        siginfo_t info;
 
        info.si_signo   = si_signo;
        info.si_errno   = 0;
        info.si_code    = si_code;
        info.si_addr    = (void __user *)address;
-       info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
+       if (fault & VM_FAULT_HWPOISON_LARGE)
+               lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+       if (fault & VM_FAULT_HWPOISON)
+               lsb = PAGE_SHIFT;
+       info.si_addr_lsb = lsb;
 
        force_sig_info(si_signo, &info, tsk);
 }
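
The fault argument lets the helper derive si_addr_lsb, which tells a recovery-aware process how much memory around si_addr is gone: PAGE_SHIFT (12, i.e. 4K) for an ordinary poisoned page, or the huge page shift (e.g. 21 for a 2MB page) when VM_FAULT_HWPOISON_LARGE carries an hstate index. A minimal userspace sketch of the consuming side, assuming a libc that exposes si_addr_lsb; the handler name and messages are illustrative, not part of this patch:

        #define _GNU_SOURCE
        #include <signal.h>
        #include <stdio.h>
        #include <unistd.h>

        static void sigbus_handler(int sig, siginfo_t *info, void *ucontext)
        {
                if (info->si_code == BUS_MCEERR_AR) {
                        /* si_addr_lsb is log2 of the corrupted unit:
                         * 12 for a 4K page, 21 for a 2MB huge page. */
                        unsigned long lost = 1UL << info->si_addr_lsb;
                        fprintf(stderr, "hwpoison at %p, %lu bytes unusable\n",
                                info->si_addr, lost);
                }
                _exit(1);
        }

        int main(void)
        {
                struct sigaction sa = { .sa_sigaction = sigbus_handler,
                                        .sa_flags = SA_SIGINFO };
                sigaction(SIGBUS, &sa, NULL);
                pause();        /* a fault would be injected elsewhere */
                return 0;
        }
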
@@ -229,7 +235,16 @@ void vmalloc_sync_all(void)
 
                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
-                       if (!vmalloc_sync_one(page_address(page), address))
+                       spinlock_t *pgt_lock;
+                       pmd_t *ret;
+
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+                       spin_lock(pgt_lock);
+                       ret = vmalloc_sync_one(page_address(page), address);
+                       spin_unlock(pgt_lock);
+
+                       if (!ret)
                                break;
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
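
The pgt_lock taken here is the page_table_lock of the mm that owns the pgd page, looked up via pgd_page_get_mm(); nesting it inside the irq-safe pgd_lock serializes the sync against that mm's own page-table updates. A sketch of the lookup helper this depends on, assuming (as the companion x86 change introduced alongside this one does) that a pgd page stashes its owning mm in page->index when the pgd is allocated:

        static inline struct mm_struct *pgd_page_get_mm(struct page *page)
        {
                return (struct mm_struct *)page->index;
        }
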
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;
 
+       WARN_ON_ONCE(in_nmi());
+
        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
@@ -326,29 +343,7 @@ out:
 
 void vmalloc_sync_all(void)
 {
-       unsigned long address;
-
-       for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-            address += PGDIR_SIZE) {
-
-               const pgd_t *pgd_ref = pgd_offset_k(address);
-               unsigned long flags;
-               struct page *page;
-
-               if (pgd_none(*pgd_ref))
-                       continue;
-
-               spin_lock_irqsave(&pgd_lock, flags);
-               list_for_each_entry(page, &pgd_list, lru) {
-                       pgd_t *pgd;
-                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                       if (pgd_none(*pgd))
-                               set_pgd(pgd, *pgd_ref);
-                       else
-                               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-               }
-               spin_unlock_irqrestore(&pgd_lock, flags);
-       }
+       sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 }
 
 /*
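
On 64-bit, the open-coded loop is replaced by sync_global_pgds(), a shared helper (in arch/x86/mm/init_64.c) that the fault path and memory hotplug can both use to propagate kernel pgd entries. A sketch reconstructed from the lines deleted above; the upstream helper additionally takes each mm's page_table_lock, mirroring the 32-bit change earlier in this diff:

        void sync_global_pgds(unsigned long start, unsigned long end)
        {
                unsigned long address;

                for (address = start; address <= end; address += PGDIR_SIZE) {
                        const pgd_t *pgd_ref = pgd_offset_k(address);
                        unsigned long flags;
                        struct page *page;

                        if (pgd_none(*pgd_ref))
                                continue;

                        spin_lock_irqsave(&pgd_lock, flags);
                        list_for_each_entry(page, &pgd_list, lru) {
                                pgd_t *pgd;
                                pgd = (pgd_t *)page_address(page)
                                        + pgd_index(address);
                                if (pgd_none(*pgd))
                                        set_pgd(pgd, *pgd_ref);
                                else
                                        BUG_ON(pgd_page_vaddr(*pgd)
                                               != pgd_page_vaddr(*pgd_ref));
                        }
                        spin_unlock_irqrestore(&pgd_lock, flags);
                }
        }
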
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;
 
+       WARN_ON_ONCE(in_nmi());
+
        /*
         * Copy kernel mappings over when needed. This can also
        * happen within a race in page table update. In the latter
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                tsk->thread.error_code  = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no     = 14;
 
-               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+               force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
 
                return;
        }
@@ -816,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
        tsk->thread.trap_no     = 14;
 
 #ifdef CONFIG_MEMORY_FAILURE
-       if (fault & VM_FAULT_HWPOISON) {
+       if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
                printk(KERN_ERR
        "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                        tsk->comm, tsk->pid, address);
                code = BUS_MCEERR_AR;
        }
 #endif
-       force_sig_info_fault(SIGBUS, code, address, tsk);
+       force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
 static noinline void
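
VM_FAULT_HWPOISON_LARGE carries the hstate index in the upper bits of the fault code, which is what force_sig_info_fault() decodes via VM_FAULT_GET_HINDEX() above. For reference, the encoding added to include/linux/mm.h by this series (values as of this era; sketch):

        #define VM_FAULT_HWPOISON       0x0010  /* hit poisoned small page */
        #define VM_FAULT_HWPOISON_LARGE 0x0020  /* hit poisoned large page */
        #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index */

        #define VM_FAULT_SET_HINDEX(x)  ((x) << 12)
        #define VM_FAULT_GET_HINDEX(x)  (((x) >> 12) & 0xf)
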
@@ -833,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
        if (fault & VM_FAULT_OOM) {
                out_of_memory(regs, error_code, address);
        } else {
-               if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+               if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+                            VM_FAULT_HWPOISON_LARGE))
                        do_sigbus(regs, error_code, address, fault);
                else
                        BUG();
@@ -894,8 +892,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
        if (pmd_large(*pmd))
                return spurious_fault_check(error_code, (pte_t *) pmd);
 
+       /*
+        * Note: don't use pte_present() here, since it returns true
+        * if the _PAGE_PROTNONE bit is set.  However, this aliases the
+        * _PAGE_GLOBAL bit, which for kernel pages gives false positives
+        * when CONFIG_DEBUG_PAGEALLOC is used.
+        */
        pte = pte_offset_kernel(pmd, address);
-       if (!pte_present(*pte))
+       if (!(pte_flags(*pte) & _PAGE_PRESENT))
                return 0;
 
        ret = spurious_fault_check(error_code, pte);
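
The aliasing the new comment describes comes from the x86 pte bit layout: _PAGE_BIT_PROTNONE is defined as _PAGE_BIT_GLOBAL (bit 8), since PROT_NONE only needs the bit on non-present user ptes while _PAGE_GLOBAL is only meaningful on present ones. For reference, roughly as in arch/x86/include/asm/pgtable_types.h and pgtable.h of this era:

        #define _PAGE_BIT_GLOBAL        8       /* global TLB entry (PGE) */
        #define _PAGE_BIT_PROTNONE      _PAGE_BIT_GLOBAL

        static inline int pte_present(pte_t a)
        {
                return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
        }

With CONFIG_DEBUG_PAGEALLOC, a kernel pte can have _PAGE_PRESENT cleared while _PAGE_GLOBAL is still set, so pte_present() would report it mapped; testing _PAGE_PRESENT alone avoids the false positive.
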
@@ -915,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address)
 int show_unhandled_signals = 1;
 
 static inline int
-access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
+access_error(unsigned long error_code, struct vm_area_struct *vma)
 {
-       if (write) {
+       if (error_code & PF_WRITE) {
                /* write, present and write, not present: */
                if (unlikely(!(vma->vm_flags & VM_WRITE)))
                        return 1;
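
Dropping the write parameter works because the hardware error code already encodes the access type; PF_WRITE is decoded by the enum near the top of this file, reproduced here for reference:

        enum x86_pf_error_code {
                PF_PROT         = 1 << 0,      /* protection fault, page was present */
                PF_WRITE        = 1 << 1,      /* access was a write */
                PF_USER         = 1 << 2,      /* fault from user mode */
                PF_RSVD         = 1 << 3,      /* reserved bit set in paging entry */
                PF_INSTR        = 1 << 4,      /* fault was an instruction fetch */
        };
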
@@ -952,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
        struct task_struct *tsk;
        unsigned long address;
        struct mm_struct *mm;
-       int write;
        int fault;
+       int write = error_code & PF_WRITE;
+       unsigned int flags = FAULT_FLAG_ALLOW_RETRY |
+                                       (write ? FAULT_FLAG_WRITE : 0);
 
        tsk = current;
        mm = tsk->mm;
@@ -1064,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
                        bad_area_nosemaphore(regs, error_code, address);
                        return;
                }
+retry:
                down_read(&mm->mmap_sem);
        } else {
                /*
@@ -1107,9 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
         * we can handle it..
         */
 good_area:
-       write = error_code & PF_WRITE;
-
-       if (unlikely(access_error(error_code, write, vma))) {
+       if (unlikely(access_error(error_code, vma))) {
                bad_area_access_error(regs, error_code, address);
                return;
        }
@@ -1119,21 +1124,36 @@ good_area:
         * make sure we exit gracefully rather than endlessly redo
         * the fault:
         */
-       fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
+       fault = handle_mm_fault(mm, vma, address, flags);
 
        if (unlikely(fault & VM_FAULT_ERROR)) {
                mm_fault_error(regs, error_code, address, fault);
                return;
        }
 
-       if (fault & VM_FAULT_MAJOR) {
-               tsk->maj_flt++;
-               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
-                                    regs, address);
-       } else {
-               tsk->min_flt++;
-               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
-                                    regs, address);
+       /*
+        * Major/minor page fault accounting is only done on the
+        * initial attempt. If we go through a retry, it is extremely
+        * likely that the page will be found in page cache at that point.
+        */
+       if (flags & FAULT_FLAG_ALLOW_RETRY) {
+               if (fault & VM_FAULT_MAJOR) {
+                       tsk->maj_flt++;
+                       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+                                     regs, address);
+               } else {
+                       tsk->min_flt++;
+                       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+                                     regs, address);
+               }
+               if (fault & VM_FAULT_RETRY) {
+                       /*
+                        * Clear FAULT_FLAG_ALLOW_RETRY to avoid any
+                        * risk of starvation.
+                        */
+                       flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       goto retry;
+               }
        }
 
        check_v8086_mode(regs, address, tsk);
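
VM_FAULT_RETRY comes from the mm side dropping mmap_sem before sleeping on a locked page, which is why the goto lands on the down_read() at the retry: label instead of re-entering with the lock held, and why the second pass clears FAULT_FLAG_ALLOW_RETRY so it cannot loop indefinitely. A sketch of the helper introduced by the same series (mm/filemap.c, simplified):

        int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                 unsigned int flags)
        {
                if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
                        __lock_page(page);
                        return 1;
                }
                /* Drop mmap_sem before sleeping on the page lock; the
                 * caller then reports VM_FAULT_RETRY and the fault is
                 * redone from the retry: label above. */
                up_read(&mm->mmap_sem);
                wait_on_page_locked(page);
                return 0;
        }
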