mm: extended batches for generic mmu_gather

[mv-sheeva.git] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index 61e66f026563b4b473fc73922d58a984c490c827..17193d74f30284a66269da7bb8ec65901abadfd4 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -193,6 +193,83 @@ static void check_sync_rss_stat(struct task_struct *task)
  
  #endif
  
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+
+/*
+ * See the comment near struct mmu_table_batch.
+ */
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+       /*
+        * Simply deliver the interrupt.
+        *
+        * The body is intentionally empty and @arg is unused: this function
+        * only exists to be passed as the callback to smp_call_function()
+        * in tlb_remove_table_one().
+        */
+}
+
+static void tlb_remove_table_one(void *table)
+{
+       /*
+        * Release a single table without batching. tlb_remove_table() falls
+        * back to this path when it cannot allocate a batch page.
+        *
+        * This isn't an RCU grace period and hence the page-tables cannot be
+        * assumed to be actually RCU-freed.
+        *
+        * It is however sufficient for software page-table walkers that rely on
+        * IRQ disabling. See the comment near struct mmu_table_batch.
+        *
+        * The no-op cross-call is issued before the table is handed to
+        * __tlb_remove_table(); keep the two calls in this order.
+        * NOTE(review): the trailing 1 passed to smp_call_function() is
+        * presumably the "wait for completion" flag -- confirm.
+        */
+       smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+       __tlb_remove_table(table);
+}
+
+/*
+ * Callback queued by tlb_table_flush() through call_rcu_sched(): pass every
+ * table recorded in the batch to __tlb_remove_table(), then free the page
+ * that holds the batch itself.
+ */
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+       struct mmu_table_batch *done;
+       int idx = 0;
+
+       /* The rcu_head is embedded in the batch; recover its container. */
+       done = container_of(head, struct mmu_table_batch, rcu);
+
+       while (idx < done->nr) {
+               __tlb_remove_table(done->tables[idx]);
+               idx++;
+       }
+
+       /* The batch was obtained as a single page, so return it as one. */
+       free_page((unsigned long)done);
+}
+
+/*
+ * Hand the gather's pending table batch, if there is one, to
+ * call_rcu_sched() with tlb_remove_table_rcu() as the callback, and detach
+ * it from @tlb. Does nothing when no batch is pending.
+ */
+void tlb_table_flush(struct mmu_gather *tlb)
+{
+       struct mmu_table_batch *pending = tlb->batch;
+
+       if (!pending)
+               return;
+
+       call_rcu_sched(&pending->rcu, tlb_remove_table_rcu);
+       /* The callback owns the batch from here on; start afresh next time. */
+       tlb->batch = NULL;
+}
+
+/*
+ * Queue @table for release on behalf of @tlb.
+ *
+ * The table is released immediately when the mm has fewer than two users,
+ * released through tlb_remove_table_one() when no batch page can be
+ * allocated, and otherwise appended to the gather's batch, which is flushed
+ * with tlb_table_flush() once it holds MAX_TABLE_BATCH entries.
+ */
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+       struct mmu_table_batch *cur;
+
+       tlb->need_flush = 1;
+
+       /*
+        * When there are fewer than two users of this mm there cannot be a
+        * concurrent page-table walk.
+        */
+       if (atomic_read(&tlb->mm->mm_users) < 2) {
+               __tlb_remove_table(table);
+               return;
+       }
+
+       cur = tlb->batch;
+       if (!cur) {
+               /* Non-blocking, silent allocation; failure is handled below. */
+               cur = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+               tlb->batch = cur;
+               if (!cur) {
+                       /* No batch page available: release this one table alone. */
+                       tlb_remove_table_one(table);
+                       return;
+               }
+               cur->nr = 0;
+       }
+
+       cur->tables[cur->nr++] = table;
+       /* A full batch is handed off right away. */
+       if (cur->nr == MAX_TABLE_BATCH)
+               tlb_table_flush(tlb);
+}
+
+#endif
+
  /*
   * If a p?d_bad entry is found while walking page tables, report
   * the error, before resetting entry to p?d_none.  Usually (but
@@ -912,12 +989,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                 long *zap_work, struct zap_details *details)
  {
         struct mm_struct *mm = tlb->mm;
+       int force_flush = 0;
         pte_t *pte;
         spinlock_t *ptl;
         int rss[NR_MM_COUNTERS];
  
+again:
         init_rss_vec(rss);
-
         pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
         arch_enter_lazy_mmu_mode();
         do {
@@ -974,7 +1052,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                         page_remove_rmap(page);
                         if (unlikely(page_mapcount(page) < 0))
                                 print_bad_pte(vma, addr, ptent, page);
-                       tlb_remove_page(tlb, page);
+                       force_flush = !__tlb_remove_page(tlb, page);
+                       if (force_flush)
+                               break;
                         continue;
                 }
                 /*
@@ -1001,6 +1081,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
         arch_leave_lazy_mmu_mode();
         pte_unmap_unlock(pte - 1, ptl);
  
+       /*
+        * mmu_gather ran out of room to batch pages; we break out of
+        * the PTE lock to avoid doing the potentially expensive TLB invalidate
+        * and page-free while holding it.
+        */
+       if (force_flush) {
+               force_flush = 0;
+               tlb_flush_mmu(tlb);
+               if (addr != end)
+                       goto again;
+       }
+
         return addr;
  }
  
@@ -1121,17 +1213,14 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
   * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
   * drops the lock and schedules.
   */
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather *tlb,
                 struct vm_area_struct *vma, unsigned long start_addr,
                 unsigned long end_addr, unsigned long *nr_accounted,
                 struct zap_details *details)
  {
         long zap_work = ZAP_BLOCK_SIZE;
-       unsigned long tlb_start = 0;    /* For tlb_finish_mmu */
-       int tlb_start_valid = 0;
         unsigned long start = start_addr;
         spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-       int fullmm = (*tlbp)->fullmm;
         struct mm_struct *mm = vma->vm_mm;
  
         mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1152,11 +1241,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                         untrack_pfn_vma(vma, 0, 0);
  
                 while (start != end) {
-                       if (!tlb_start_valid) {
-                               tlb_start = start;
-                               tlb_start_valid = 1;
-                       }
-
                         if (unlikely(is_vm_hugetlb_page(vma))) {
                                 /*
                                  * It is undesirable to test vma->vm_file as it
@@ -1177,7 +1261,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
  
                                 start = end;
                         } else
-                               start = unmap_page_range(*tlbp, vma,
+                               start = unmap_page_range(tlb, vma,
                                                 start, end, &zap_work, details);
  
                         if (zap_work > 0) {
@@ -1185,19 +1269,13 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                                 break;
                         }
  
-                       tlb_finish_mmu(*tlbp, tlb_start, start);
-
                         if (need_resched() ||
                                 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-                               if (i_mmap_lock) {
-                                       *tlbp = NULL;
+                               if (i_mmap_lock)
                                         goto out;
-                               }
                                 cond_resched();
                         }
  
-                       *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
-                       tlb_start_valid = 0;
                         zap_work = ZAP_BLOCK_SIZE;
                 }
         }
@@ -1217,16 +1295,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
                 unsigned long size, struct zap_details *details)
  {
         struct mm_struct *mm = vma->vm_mm;
-       struct mmu_gather *tlb;
+       struct mmu_gather tlb;
         unsigned long end = address + size;
         unsigned long nr_accounted = 0;
  
         lru_add_drain();
-       tlb = tlb_gather_mmu(mm, 0);
+       tlb_gather_mmu(&tlb, mm, 0);
         update_hiwater_rss(mm);
         end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-       if (tlb)
-               tlb_finish_mmu(tlb, address, end);
+       tlb_finish_mmu(&tlb, address, end);
         return end;
  }
  
@@ -2966,7 +3043,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
                 if (prev && prev->vm_end == address)
                         return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
  
-               expand_stack(vma, address - PAGE_SIZE);
+               expand_downwards(vma, address - PAGE_SIZE);
         }
         if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
                 struct vm_area_struct *next = vma->vm_next;