x86/mm, sched/core: Turn off IRQs in switch_mm()

[linux-beck.git] / arch / x86 / mm / tlb.c
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c

index 8f4cc3dfac322a2911ab2b2e8471c6e8e87d8af0..5643fd0b1a7d271da14dee848589d99437b25b49 100644 (file)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,8 @@
   *     Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
   */
  
+#ifdef CONFIG_SMP
+
  struct flush_tlb_info {
         struct mm_struct *flush_mm;
         unsigned long flush_start;
@@ -57,6 +59,118 @@ void leave_mm(int cpu)
  }
  EXPORT_SYMBOL_GPL(leave_mm);
  
+#endif /* CONFIG_SMP */
+
+void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+              struct task_struct *tsk)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       switch_mm_irqs_off(prev, next, tsk);
+       local_irq_restore(flags);
+}
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+                       struct task_struct *tsk)
+{
+       unsigned cpu = smp_processor_id();
+
+       if (likely(prev != next)) {
+#ifdef CONFIG_SMP
+               this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+               this_cpu_write(cpu_tlbstate.active_mm, next);
+#endif
+               cpumask_set_cpu(cpu, mm_cpumask(next));
+
+               /*
+                * Re-load page tables.
+                *
+                * This logic has an ordering constraint:
+                *
+                *  CPU 0: Write to a PTE for 'next'
+                *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+                *  CPU 1: set bit 1 in next's mm_cpumask
+                *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+                *
+                * We need to prevent an outcome in which CPU 1 observes
+                * the new PTE value and CPU 0 observes bit 1 clear in
+                * mm_cpumask.  (If that occurs, then the IPI will never
+                * be sent, and CPU 0's TLB will contain a stale entry.)
+                *
+                * The bad outcome can occur if either CPU's load is
+                * reordered before that CPU's store, so both CPUs must
+                * execute full barriers to prevent this from happening.
+                *
+                * Thus, switch_mm needs a full barrier between the
+                * store to mm_cpumask and any operation that could load
+                * from next->pgd.  TLB fills are special and can happen
+                * due to instruction fetches or for no reason at all,
+                * and neither LOCK nor MFENCE orders them.
+                * Fortunately, load_cr3() is serializing and gives the
+                * ordering guarantee we need.
+                *
+                */
+               load_cr3(next->pgd);
+
+               trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+               /* Stop flush ipis for the previous mm */
+               cpumask_clear_cpu(cpu, mm_cpumask(prev));
+
+               /* Load per-mm CR4 state */
+               load_mm_cr4(next);
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+               /*
+                * Load the LDT, if the LDT is different.
+                *
+                * It's possible that prev->context.ldt doesn't match
+                * the LDT register.  This can happen if leave_mm(prev)
+                * was called and then modify_ldt changed
+                * prev->context.ldt but suppressed an IPI to this CPU.
+                * In this case, prev->context.ldt != NULL, because we
+                * never set context.ldt to NULL while the mm still
+                * exists.  That means that next->context.ldt !=
+                * prev->context.ldt, because mms never share an LDT.
+                */
+               if (unlikely(prev->context.ldt != next->context.ldt))
+                       load_mm_ldt(next);
+#endif
+       }
+#ifdef CONFIG_SMP
+         else {
+               this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+               BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
+
+               if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+                       /*
+                        * On established mms, the mm_cpumask is only changed
+                        * from irq context, from ptep_clear_flush() while in
+                        * lazy tlb mode, and here. Irqs are blocked during
+                        * schedule, protecting us from simultaneous changes.
+                        */
+                       cpumask_set_cpu(cpu, mm_cpumask(next));
+
+                       /*
+                        * We were in lazy tlb mode and leave_mm disabled
+                        * tlb flush IPI delivery. We must reload CR3
+                        * to make sure to use no freed page tables.
+                        *
+                        * As above, load_cr3() is serializing and orders TLB
+                        * fills with respect to the mm_cpumask write.
+                        */
+                       load_cr3(next->pgd);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+                       load_mm_cr4(next);
+                       load_mm_ldt(next);
+               }
+       }
+#endif
+}
+
+#ifdef CONFIG_SMP
+
  /*
   * The flush IPI assumes that a thread switch happens in this order:
   * [cpu0: the cpu that switches]
@@ -104,10 +218,8 @@ static void flush_tlb_func(void *info)
  
         inc_irq_stat(irq_tlb_count);
  
-       if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+       if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
                 return;
-       if (!f->flush_end)
-               f->flush_end = f->flush_start + PAGE_SIZE;
  
         count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
         if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
@@ -135,12 +247,20 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
                                  unsigned long end)
  {
         struct flush_tlb_info info;
+
+       if (end == 0)
+               end = start + PAGE_SIZE;
         info.flush_mm = mm;
         info.flush_start = start;
         info.flush_end = end;
  
         count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
-       trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
+       if (end == TLB_FLUSH_ALL)
+               trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
+       else
+               trace_tlb_flush(TLB_REMOTE_SEND_IPI,
+                               (end - start) >> PAGE_SHIFT);
+
         if (is_uv_system()) {
                 unsigned int cpu;
  
@@ -347,3 +467,5 @@ static int __init create_tlb_single_page_flush_ceiling(void)
         return 0;
  }
  late_initcall(create_tlb_single_page_flush_ceiling);
+
+#endif /* CONFIG_SMP */