sched/core: Optimize __schedule()

author Peter Zijlstra <peterz@infradead.org>

Tue, 13 Sep 2016 16:37:29 +0000 (18:37 +0200)

committer Ingo Molnar <mingo@kernel.org>

Thu, 22 Sep 2016 12:53:45 +0000 (14:53 +0200)
author Peter Zijlstra <peterz@infradead.org>
Tue, 13 Sep 2016 16:37:29 +0000 (18:37 +0200)
committer Ingo Molnar <mingo@kernel.org>
Thu, 22 Sep 2016 12:53:45 +0000 (14:53 +0200)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h

index d96a6118d26a91b470a0fba9b7ad8b6b8b1d0087..74fd6f05bc5bf5afad4364f6118aebc1a7407913 100644 (file)
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
  extern struct atomic_notifier_head panic_notifier_list;
  extern long (*panic_blink)(int state);
  __printf(1, 2)
-void panic(const char *fmt, ...)
-       __noreturn __cold;
+void panic(const char *fmt, ...) __noreturn __cold;
  void nmi_panic(struct pt_regs *regs, const char *msg);
  extern void oops_enter(void);
  extern void oops_exit(void);
  void print_oops_end_marker(void);
  extern int oops_may_print(void);
-void do_exit(long error_code)
-       __noreturn;
-void complete_and_exit(struct completion *, long)
-       __noreturn;
+void do_exit(long error_code) __noreturn;
+void complete_and_exit(struct completion *, long) __noreturn;
  
  /* Internal, do not use. */
  int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
diff --git a/include/linux/sched.h b/include/linux/sched.h

index d75024053e9b525003bb92e805f5c3500d2f6ede..f00ee8e90a294bd8b9dadd01c0517f1d5cbe4e2d 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -448,6 +448,8 @@ static inline void io_schedule(void)
         io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
  }
  
+void __noreturn do_task_dead(void);
+
  struct nsproxy;
  struct user_namespace;
  
diff --git a/kernel/exit.c b/kernel/exit.c

index 091a78be3b09d5669d9c10b98f6300e4171d2413..1e1d913914c0f75d2656fbf9afa2db3a03447812 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -725,7 +725,7 @@ static void check_stack_usage(void)
  static inline void check_stack_usage(void) {}
  #endif
  
-void do_exit(long code)
+void __noreturn do_exit(long code)
  {
         struct task_struct *tsk = current;
         int group_dead;
@@ -882,29 +882,7 @@ void do_exit(long code)
         exit_rcu();
         TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
  
-       /*
-        * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
-        * when the following two conditions become true.
-        *   - There is race condition of mmap_sem (It is acquired by
-        *     exit_mm()), and
-        *   - SMI occurs before setting TASK_RUNINNG.
-        *     (or hypervisor of virtual machine switches to other guest)
-        *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
-        *
-        * To avoid it, we have to wait for releasing tsk->pi_lock which
-        * is held by try_to_wake_up()
-        */
-       smp_mb();
-       raw_spin_unlock_wait(&tsk->pi_lock);
-
-       /* causes final put_task_struct in finish_task_switch(). */
-       tsk->state = TASK_DEAD;
-       tsk->flags |= PF_NOFREEZE;      /* tell freezer to ignore us */
-       schedule();
-       BUG();
-       /* Avoid "noreturn function does return".  */
-       for (;;)
-               cpu_relax();    /* For when BUG is null */
+       do_task_dead();
  }
  EXPORT_SYMBOL_GPL(do_exit);
  
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index ff4e3c066dc2e6aee479aacff30556e50b7d5ec1..b2ec53c1a9746a54ac58d38509c927928b98fd05 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3331,17 +3331,6 @@ static void __sched notrace __schedule(bool preempt)
         rq = cpu_rq(cpu);
         prev = rq->curr;
  
-       /*
-        * do_exit() calls schedule() with preemption disabled as an exception;
-        * however we must fix that up, otherwise the next task will see an
-        * inconsistent (higher) preempt count.
-        *
-        * It also avoids the below schedule_debug() test from complaining
-        * about this.
-        */
-       if (unlikely(prev->state == TASK_DEAD))
-               preempt_enable_no_resched_notrace();
-
         schedule_debug(prev);
  
         if (sched_feat(HRTICK))
@@ -3409,6 +3398,33 @@ static void __sched notrace __schedule(bool preempt)
  }
  STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
  
+void __noreturn do_task_dead(void)
+{
+       /*
+        * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+        * when both of the following conditions hold:
+        *   - There is a race on mmap_sem (which is acquired by
+        *     exit_mm()), and
+        *   - An SMI occurs before TASK_RUNNING is set
+        *     (or the VM's hypervisor switches to another guest).
+        * As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
+        *
+        * To avoid this, we have to wait for current->pi_lock, which is
+        * held by try_to_wake_up(), to be released.
+        */
+       smp_mb();
+       raw_spin_unlock_wait(&current->pi_lock);
+
+       /* causes final put_task_struct in finish_task_switch(). */
+       __set_current_state(TASK_DEAD);
+       current->flags |= PF_NOFREEZE;  /* tell freezer to ignore us */
+       __schedule(false);
+       BUG();
+       /* Avoid "noreturn function does return".  */
+       for (;;)
+               cpu_relax();    /* For when BUG is null */
+}
+
  static inline void sched_submit_work(struct task_struct *tsk)
  {
         if (!tsk->state || tsk_is_pi_blocked(tsk))
author	Peter Zijlstra <peterz@infradead.org>
	Tue, 13 Sep 2016 16:37:29 +0000 (18:37 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Thu, 22 Sep 2016 12:53:45 +0000 (14:53 +0200)
include/linux/kernel.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history