workqueue: use irqsafe timer for delayed_work
index 9a3128dc67df450d201969bb5fa9a99773d99c1b..b394df8beaee6ea6288c351fbabb806208a62cb7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
 #include "workqueue_sched.h"
 
 enum {
-       /* global_cwq flags */
-       GCWQ_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
-       GCWQ_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
-       GCWQ_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
-       GCWQ_FREEZING           = 1 << 3,       /* freeze in progress */
-       GCWQ_HIGHPRI_PENDING    = 1 << 4,       /* highpri works on queue */
+       /*
+        * global_cwq flags
+        *
+        * A bound gcwq is either associated or disassociated with its CPU.
+        * While associated (!DISASSOCIATED), all workers are bound to the
+        * CPU and none has %WORKER_UNBOUND set and concurrency management
+        * is in effect.
+        *
+        * While DISASSOCIATED, the cpu may be offline and all workers have
+        * %WORKER_UNBOUND set and concurrency management disabled, and may
+        * be executing on any CPU.  The gcwq behaves as an unbound one.
+        *
+        * Note that DISASSOCIATED can be flipped only while holding
+        * managership of all pools on the gcwq to avoid changing binding
+        * state while create_worker() is in progress.
+        */
+       GCWQ_DISASSOCIATED      = 1 << 0,       /* cpu can't serve workers */
+       GCWQ_FREEZING           = 1 << 1,       /* freeze in progress */
+
+       /* pool flags */
+       POOL_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
 
        /* worker flags */
        WORKER_STARTED          = 1 << 0,       /* started */
        WORKER_DIE              = 1 << 1,       /* die die die */
        WORKER_IDLE             = 1 << 2,       /* is idle */
        WORKER_PREP             = 1 << 3,       /* preparing to run works */
-       WORKER_ROGUE            = 1 << 4,       /* not bound to any cpu */
        WORKER_REBIND           = 1 << 5,       /* mom is home, come back */
        WORKER_CPU_INTENSIVE    = 1 << 6,       /* cpu intensive */
        WORKER_UNBOUND          = 1 << 7,       /* worker is unbound */
 
-       WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
-                                 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
+       WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
+                                 WORKER_CPU_INTENSIVE,
 
-       /* gcwq->trustee_state */
-       TRUSTEE_START           = 0,            /* start */
-       TRUSTEE_IN_CHARGE       = 1,            /* trustee in charge of gcwq */
-       TRUSTEE_BUTCHER         = 2,            /* butcher workers */
-       TRUSTEE_RELEASE         = 3,            /* release workers */
-       TRUSTEE_DONE            = 4,            /* trustee is done */
+       NR_WORKER_POOLS         = 2,            /* # worker pools per gcwq */
 
        BUSY_WORKER_HASH_ORDER  = 6,            /* 64 pointers */
        BUSY_WORKER_HASH_SIZE   = 1 << BUSY_WORKER_HASH_ORDER,
@@ -84,13 +93,13 @@ enum {
                                                   (min two ticks) */
        MAYDAY_INTERVAL         = HZ / 10,      /* and then every 100ms */
       CREATE_COOLDOWN         = HZ,           /* time to breathe after fail */
-       TRUSTEE_COOLDOWN        = HZ / 10,      /* for trustee draining */
 
        /*
         * Rescue workers are used only on emergencies and shared by
         * all cpus.  Give -20.
         */
        RESCUER_NICE_LEVEL      = -20,
+       HIGHPRI_NICE_LEVEL      = -20,
 };
 
 /*
@@ -115,6 +124,8 @@ enum {
  */
 
 struct global_cwq;
+struct worker_pool;
+struct idle_rebind;
 
 /*
  * The poor guys doing the actual heavy lifting.  All on-duty workers
@@ -131,12 +142,31 @@ struct worker {
        struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
        struct list_head        scheduled;      /* L: scheduled works */
        struct task_struct      *task;          /* I: worker task */
-       struct global_cwq       *gcwq;          /* I: the associated gcwq */
+       struct worker_pool      *pool;          /* I: the associated pool */
        /* 64 bytes boundary on 64bit, 32 on 32bit */
        unsigned long           last_active;    /* L: last active timestamp */
        unsigned int            flags;          /* X: flags */
        int                     id;             /* I: worker id */
-       struct work_struct      rebind_work;    /* L: rebind worker to cpu */
+
+       /* for rebinding worker to CPU */
+       struct idle_rebind      *idle_rebind;   /* L: for idle worker */
+       struct work_struct      rebind_work;    /* L: for busy worker */
+};
+
+struct worker_pool {
+       struct global_cwq       *gcwq;          /* I: the owning gcwq */
+       unsigned int            flags;          /* X: flags */
+
+       struct list_head        worklist;       /* L: list of pending works */
+       int                     nr_workers;     /* L: total number of workers */
+       int                     nr_idle;        /* L: currently idle ones */
+
+       struct list_head        idle_list;      /* X: list of idle workers */
+       struct timer_list       idle_timer;     /* L: worker idle timeout */
+       struct timer_list       mayday_timer;   /* L: SOS timer for workers */
+
+       struct mutex            manager_mutex;  /* mutex manager should hold */
+       struct ida              worker_ida;     /* L: for worker IDs */
 };
 
 /*
@@ -146,27 +176,17 @@ struct worker {
  */
 struct global_cwq {
        spinlock_t              lock;           /* the gcwq lock */
-       struct list_head        worklist;       /* L: list of pending works */
        unsigned int            cpu;            /* I: the associated cpu */
        unsigned int            flags;          /* L: GCWQ_* flags */
 
-       int                     nr_workers;     /* L: total number of workers */
-       int                     nr_idle;        /* L: currently idle ones */
-
-       /* workers are chained either in the idle_list or busy_hash */
-       struct list_head        idle_list;      /* X: list of idle workers */
+       /* workers are chained either in busy_hash or pool idle_list */
        struct hlist_head       busy_hash[BUSY_WORKER_HASH_SIZE];
                                                /* L: hash of busy workers */
 
-       struct timer_list       idle_timer;     /* L: worker idle timeout */
-       struct timer_list       mayday_timer;   /* L: SOS timer for dworkers */
-
-       struct ida              worker_ida;     /* L: for worker IDs */
+       struct worker_pool      pools[NR_WORKER_POOLS];
+                                               /* normal and highpri pools */
 
-       struct task_struct      *trustee;       /* L: for gcwq shutdown */
-       unsigned int            trustee_state;  /* L: trustee state */
-       wait_queue_head_t       trustee_wait;   /* trustee wait */
-       struct worker           *first_idle;    /* L: first idle worker */
+       wait_queue_head_t       rebind_hold;    /* rebind hold wait */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -175,7 +195,7 @@ struct global_cwq {
  * aligned at two's power of the number of flag bits.
  */
 struct cpu_workqueue_struct {
-       struct global_cwq       *gcwq;          /* I: the associated gcwq */
+       struct worker_pool      *pool;          /* I: the associated pool */
        struct workqueue_struct *wq;            /* I: the owning workqueue */
        int                     work_color;     /* L: current color */
        int                     flush_color;    /* L: flushing color */
@@ -249,21 +269,23 @@ struct workqueue_struct {
 };
 
 struct workqueue_struct *system_wq __read_mostly;
-struct workqueue_struct *system_long_wq __read_mostly;
-struct workqueue_struct *system_nrt_wq __read_mostly;
-struct workqueue_struct *system_unbound_wq __read_mostly;
-struct workqueue_struct *system_freezable_wq __read_mostly;
-struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_wq);
+struct workqueue_struct *system_highpri_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_highpri_wq);
+struct workqueue_struct *system_long_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_long_wq);
-EXPORT_SYMBOL_GPL(system_nrt_wq);
+struct workqueue_struct *system_unbound_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
-EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
 
+#define for_each_worker_pool(pool, gcwq)                               \
+       for ((pool) = &(gcwq)->pools[0];                                \
+            (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
+
 #define for_each_busy_worker(worker, i, pos, gcwq)                     \
        for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)                     \
                hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -444,7 +466,7 @@ static bool workqueue_freezing;             /* W: have wqs started freezing? */
  * try_to_wake_up().  Put it in a separate cacheline.
  */
 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
-static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
+static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
 
 /*
  * Global cpu workqueue and nr_running counter for unbound gcwq.  The
@@ -452,10 +474,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
  * workers have WORKER_UNBOUND set.
  */
 static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);      /* always 0 */
+static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
+       [0 ... NR_WORKER_POOLS - 1]     = ATOMIC_INIT(0),       /* always 0 */
+};
 
 static int worker_thread(void *__worker);
 
+static int worker_pool_pri(struct worker_pool *pool)
+{
+       return pool - pool->gcwq->pools;
+}
+
 static struct global_cwq *get_gcwq(unsigned int cpu)
 {
        if (cpu != WORK_CPU_UNBOUND)
@@ -464,12 +493,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
                return &unbound_global_cwq;
 }
 
-static atomic_t *get_gcwq_nr_running(unsigned int cpu)
+static atomic_t *get_pool_nr_running(struct worker_pool *pool)
 {
+       int cpu = pool->gcwq->cpu;
+       int idx = worker_pool_pri(pool);
+
        if (cpu != WORK_CPU_UNBOUND)
-               return &per_cpu(gcwq_nr_running, cpu);
+               return &per_cpu(pool_nr_running, cpu)[idx];
        else
-               return &unbound_gcwq_nr_running;
+               return &unbound_pool_nr_running[idx];
 }
 
 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
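
The indexing above leans on worker_pool_pri(), which recovers a pool's
priority index by plain pointer subtraction against the pools[] array
embedded in its gcwq.  A minimal user-space sketch of that idiom, with
made-up mini_* names rather than the kernel's types:

#include <stdio.h>

#define NR_POOLS 2

struct mini_gcwq;

struct mini_pool {
	struct mini_gcwq *gcwq;			/* owning container */
};

struct mini_gcwq {
	struct mini_pool pools[NR_POOLS];
};

/* same trick as worker_pool_pri(): element pointer minus array base */
static int mini_pool_pri(struct mini_pool *pool)
{
	return pool - pool->gcwq->pools;
}

int main(void)
{
	struct mini_gcwq g;

	for (int i = 0; i < NR_POOLS; i++)
		g.pools[i].gcwq = &g;

	/* prints "0 1": the index falls out of the pointer difference */
	printf("%d %d\n", mini_pool_pri(&g.pools[0]), mini_pool_pri(&g.pools[1]));
	return 0;
}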
@@ -500,18 +532,24 @@ static int work_next_color(int color)
 }
 
 /*
- * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
- * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
- * cleared and the work data contains the cpu number it was last on.
+ * While queued, %WORK_STRUCT_CWQ is set and non-flag bits of a work's data
+ * contain the pointer to the queued cwq.  Once execution starts, the flag
+ * is cleared and the high bits contain OFFQ flags and CPU number.
+ *
+ * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
+ * and clear_work_data() can be used to set the cwq, cpu or clear
+ * work->data.  These functions should only be called while the work is
+ * owned - ie. while the PENDING bit is set.
  *
- * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
- * cwq, cpu or clear work->data.  These functions should only be
- * called while the work is owned - ie. while the PENDING bit is set.
+ * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
+ * a work.  gcwq is available once the work has been queued anywhere after
+ * initialization until it is sync canceled.  cwq is available only while
+ * the work item is queued.
  *
- * get_work_[g]cwq() can be used to obtain the gcwq or cwq
- * corresponding to a work.  gcwq is available once the work has been
- * queued anywhere after initialization.  cwq is available only from
- * queueing until execution starts.
+ * %WORK_OFFQ_CANCELING is used to mark a work item which is being
+ * canceled.  While being canceled, a work item may have its PENDING set
+ * but stay off timer and worklist for arbitrarily long and nobody should
+ * try to steal the PENDING bit.
  */
 static inline void set_work_data(struct work_struct *work, unsigned long data,
                                 unsigned long flags)
@@ -528,13 +566,22 @@ static void set_work_cwq(struct work_struct *work,
                      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
 }
 
-static void set_work_cpu(struct work_struct *work, unsigned int cpu)
+static void set_work_cpu_and_clear_pending(struct work_struct *work,
+                                          unsigned int cpu)
 {
-       set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
+       /*
+        * The following wmb is paired with the implied mb in
+        * test_and_set_bit(PENDING) and ensures all updates to @work made
+        * here are visible to and precede any updates by the next PENDING
+        * owner.
+        */
+       smp_wmb();
+       set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
 }
 
 static void clear_work_data(struct work_struct *work)
 {
+       smp_wmb();      /* see set_work_cpu_and_clear_pending() */
        set_work_data(work, WORK_STRUCT_NO_CPU, 0);
 }
 
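
The scheme documented above is a tagged word: %WORK_STRUCT_CWQ selects how
the remaining bits are read, either as a cwq pointer or as OFFQ flags plus
a CPU number.  A stand-alone sketch of the decode side, using invented
shift and flag values instead of the real WORK_STRUCT_*/WORK_OFFQ_*
constants:

#include <stdio.h>

/* invented layout; the real constants live in include/linux/workqueue.h */
#define F_PENDING   (1UL << 0)
#define F_CWQ       (1UL << 1)
#define F_CANCELING (1UL << 2)
#define CPU_SHIFT   5

static unsigned long encode_offq(unsigned long cpu, int canceling)
{
	return (cpu << CPU_SHIFT) | (canceling ? F_CANCELING : 0);
}

int main(void)
{
	unsigned long data = encode_offq(3, 1) | F_PENDING;

	/* off queue: flag bit clear, so high bits are CPU plus OFFQ flags */
	if (!(data & F_CWQ))
		printf("cpu=%lu canceling=%d\n",
		       data >> CPU_SHIFT, !!(data & F_CANCELING));
	return 0;
}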
@@ -555,9 +602,9 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 
        if (data & WORK_STRUCT_CWQ)
                return ((struct cpu_workqueue_struct *)
-                       (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
+                       (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
 
-       cpu = data >> WORK_STRUCT_FLAG_BITS;
+       cpu = data >> WORK_OFFQ_CPU_SHIFT;
        if (cpu == WORK_CPU_NONE)
                return NULL;
 
@@ -565,61 +612,79 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
        return get_gcwq(cpu);
 }
 
+static void mark_work_canceling(struct work_struct *work)
+{
+       struct global_cwq *gcwq = get_work_gcwq(work);
+       unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
+
+       set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
+                     WORK_STRUCT_PENDING);
+}
+
+static bool work_is_canceling(struct work_struct *work)
+{
+       unsigned long data = atomic_long_read(&work->data);
+
+       return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
+}
+
 /*
- * Policy functions.  These define the policies on how the global
- * worker pool is managed.  Unless noted otherwise, these functions
- * assume that they're being called with gcwq->lock held.
+ * Policy functions.  These define the policies on how the global worker
+ * pools are managed.  Unless noted otherwise, these functions assume that
+ * they're being called with gcwq->lock held.
  */
 
-static bool __need_more_worker(struct global_cwq *gcwq)
+static bool __need_more_worker(struct worker_pool *pool)
 {
-       return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
-               gcwq->flags & GCWQ_HIGHPRI_PENDING;
+       return !atomic_read(get_pool_nr_running(pool));
 }
 
 /*
  * Need to wake up a worker?  Called from anything but currently
  * running workers.
+ *
+ * Note that, because unbound workers never contribute to nr_running, this
+ * function will always return %true for unbound gcwq as long as the
+ * worklist isn't empty.
  */
-static bool need_more_worker(struct global_cwq *gcwq)
+static bool need_more_worker(struct worker_pool *pool)
 {
-       return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
+       return !list_empty(&pool->worklist) && __need_more_worker(pool);
 }
 
 /* Can I start working?  Called from busy but !running workers. */
-static bool may_start_working(struct global_cwq *gcwq)
+static bool may_start_working(struct worker_pool *pool)
 {
-       return gcwq->nr_idle;
+       return pool->nr_idle;
 }
 
 /* Do I need to keep working?  Called from currently running workers. */
-static bool keep_working(struct global_cwq *gcwq)
+static bool keep_working(struct worker_pool *pool)
 {
-       atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+       atomic_t *nr_running = get_pool_nr_running(pool);
 
-       return !list_empty(&gcwq->worklist) &&
-               (atomic_read(nr_running) <= 1 ||
-                gcwq->flags & GCWQ_HIGHPRI_PENDING);
+       return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
 }
 
 /* Do we need a new worker?  Called from manager. */
-static bool need_to_create_worker(struct global_cwq *gcwq)
+static bool need_to_create_worker(struct worker_pool *pool)
 {
-       return need_more_worker(gcwq) && !may_start_working(gcwq);
+       return need_more_worker(pool) && !may_start_working(pool);
 }
 
 /* Do I need to be the manager? */
-static bool need_to_manage_workers(struct global_cwq *gcwq)
+static bool need_to_manage_workers(struct worker_pool *pool)
 {
-       return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
+       return need_to_create_worker(pool) ||
+               (pool->flags & POOL_MANAGE_WORKERS);
 }
 
 /* Do we have too many workers and should some go away? */
-static bool too_many_workers(struct global_cwq *gcwq)
+static bool too_many_workers(struct worker_pool *pool)
 {
-       bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
-       int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
-       int nr_busy = gcwq->nr_workers - nr_idle;
+       bool managing = mutex_is_locked(&pool->manager_mutex);
+       int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
+       int nr_busy = pool->nr_workers - nr_idle;
 
        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 }
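
The heuristic reads: keep two idle workers in reserve, and beyond that
allow roughly one idler per MAX_IDLE_WORKERS_RATIO busy workers (the ratio
is defined earlier in this file; 4, if I read the surrounding constants
right).  A quick stand-alone check of the arithmetic:

#include <stdio.h>
#include <stdbool.h>

#define MAX_IDLE_WORKERS_RATIO 4	/* assumed; see the enum at the top */

static bool too_many(int nr_idle, int nr_busy)
{
	/* two idle workers are always tolerated as a reserve */
	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

int main(void)
{
	printf("%d\n", too_many(3, 8));		/* (3-2)*4 >= 8 is false: 0 */
	printf("%d\n", too_many(6, 16));	/* (6-2)*4 >= 16 is true: 1 */
	return 0;
}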
@@ -629,26 +694,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
  */
 
 /* Return the first worker.  Safe with preemption disabled */
-static struct worker *first_worker(struct global_cwq *gcwq)
+static struct worker *first_worker(struct worker_pool *pool)
 {
-       if (unlikely(list_empty(&gcwq->idle_list)))
+       if (unlikely(list_empty(&pool->idle_list)))
                return NULL;
 
-       return list_first_entry(&gcwq->idle_list, struct worker, entry);
+       return list_first_entry(&pool->idle_list, struct worker, entry);
 }
 
 /**
  * wake_up_worker - wake up an idle worker
- * @gcwq: gcwq to wake worker for
+ * @pool: worker pool to wake worker from
  *
- * Wake up the first idle worker of @gcwq.
+ * Wake up the first idle worker of @pool.
  *
  * CONTEXT:
  * spin_lock_irq(gcwq->lock).
  */
-static void wake_up_worker(struct global_cwq *gcwq)
+static void wake_up_worker(struct worker_pool *pool)
 {
-       struct worker *worker = first_worker(gcwq);
+       struct worker *worker = first_worker(pool);
 
        if (likely(worker))
                wake_up_process(worker->task);
@@ -670,7 +735,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
        struct worker *worker = kthread_data(task);
 
        if (!(worker->flags & WORKER_NOT_RUNNING))
-               atomic_inc(get_gcwq_nr_running(cpu));
+               atomic_inc(get_pool_nr_running(worker->pool));
 }
 
 /**
@@ -692,8 +757,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
                                       unsigned int cpu)
 {
        struct worker *worker = kthread_data(task), *to_wakeup = NULL;
-       struct global_cwq *gcwq = get_gcwq(cpu);
-       atomic_t *nr_running = get_gcwq_nr_running(cpu);
+       struct worker_pool *pool = worker->pool;
+       atomic_t *nr_running = get_pool_nr_running(pool);
 
        if (worker->flags & WORKER_NOT_RUNNING)
                return NULL;
@@ -706,14 +771,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
         * worklist not empty test sequence is in insert_work().
         * Please read comment there.
         *
-        * NOT_RUNNING is clear.  This means that trustee is not in
-        * charge and we're running on the local cpu w/ rq lock held
-        * and preemption disabled, which in turn means that none else
-        * could be manipulating idle_list, so dereferencing idle_list
-        * without gcwq lock is safe.
+        * NOT_RUNNING is clear.  This means that we're bound to and
+        * running on the local cpu w/ rq lock held and preemption
+        * disabled, which in turn means that none else could be
+        * manipulating idle_list, so dereferencing idle_list without gcwq
+        * lock is safe.
         */
-       if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
-               to_wakeup = first_worker(gcwq);
+       if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
+               to_wakeup = first_worker(pool);
        return to_wakeup ? to_wakeup->task : NULL;
 }
 
@@ -733,7 +798,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
                                    bool wakeup)
 {
-       struct global_cwq *gcwq = worker->gcwq;
+       struct worker_pool *pool = worker->pool;
 
        WARN_ON_ONCE(worker->task != current);
 
@@ -744,12 +809,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
         */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
-               atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+               atomic_t *nr_running = get_pool_nr_running(pool);
 
                if (wakeup) {
                        if (atomic_dec_and_test(nr_running) &&
-                           !list_empty(&gcwq->worklist))
-                               wake_up_worker(gcwq);
+                           !list_empty(&pool->worklist))
+                               wake_up_worker(pool);
                } else
                        atomic_dec(nr_running);
        }
@@ -769,7 +834,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
  */
 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 {
-       struct global_cwq *gcwq = worker->gcwq;
+       struct worker_pool *pool = worker->pool;
        unsigned int oflags = worker->flags;
 
        WARN_ON_ONCE(worker->task != current);
@@ -783,7 +848,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
         */
        if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
                if (!(worker->flags & WORKER_NOT_RUNNING))
-                       atomic_inc(get_gcwq_nr_running(gcwq->cpu));
+                       atomic_inc(get_pool_nr_running(pool));
 }
 
 /**
@@ -867,40 +932,191 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 }
 
 /**
- * gcwq_determine_ins_pos - find insertion position
- * @gcwq: gcwq of interest
- * @cwq: cwq a work is being queued for
+ * move_linked_works - move linked works to a list
+ * @work: start of series of works to be scheduled
+ * @head: target list to append @work to
+ * @nextp: out parameter for nested worklist walking
+ *
+ * Schedule linked works starting from @work to @head.  Work series to
+ * be scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor.
  *
- * A work for @cwq is about to be queued on @gcwq, determine insertion
- * position for the work.  If @cwq is for HIGHPRI wq, the work is
- * queued at the head of the queue but in FIFO order with respect to
- * other HIGHPRI works; otherwise, at the end of the queue.  This
- * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
- * there are HIGHPRI works pending.
+ * If @nextp is not NULL, it's updated to point to the next work of
+ * the last scheduled work.  This allows move_linked_works() to be
+ * nested inside outer list_for_each_entry_safe().
  *
  * CONTEXT:
  * spin_lock_irq(gcwq->lock).
+ */
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+                             struct work_struct **nextp)
+{
+       struct work_struct *n;
+
+       /*
+        * Linked worklist will always end before the end of the list,
+        * use NULL for list head.
+        */
+       list_for_each_entry_safe_from(work, n, NULL, entry) {
+               list_move_tail(&work->entry, head);
+               if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
+                       break;
+       }
+
+       /*
+        * If we're already inside safe list traversal and have moved
+        * multiple works to the scheduled queue, the next position
+        * needs to be updated.
+        */
+       if (nextp)
+               *nextp = n;
+}
+
+static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+{
+       struct work_struct *work = list_first_entry(&cwq->delayed_works,
+                                                   struct work_struct, entry);
+
+       trace_workqueue_activate_work(work);
+       move_linked_works(work, &cwq->pool->worklist, NULL);
+       __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
+       cwq->nr_active++;
+}
+
+/**
+ * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
+ * @cwq: cwq of interest
+ * @color: color of work which left the queue
+ * @delayed: for a delayed work
  *
- * RETURNS:
- * Pointer to inserstion position.
+ * A work either has completed or is removed from pending queue,
+ * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
  */
-static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
-                                              struct cpu_workqueue_struct *cwq)
+static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
+                                bool delayed)
 {
-       struct work_struct *twork;
+       /* ignore uncolored works */
+       if (color == WORK_NO_COLOR)
+               return;
 
-       if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
-               return &gcwq->worklist;
+       cwq->nr_in_flight[color]--;
 
-       list_for_each_entry(twork, &gcwq->worklist, entry) {
-               struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
+       if (!delayed) {
+               cwq->nr_active--;
+               if (!list_empty(&cwq->delayed_works)) {
+                       /* one down, submit a delayed one */
+                       if (cwq->nr_active < cwq->max_active)
+                               cwq_activate_first_delayed(cwq);
+               }
+       }
 
-               if (!(tcwq->wq->flags & WQ_HIGHPRI))
-                       break;
+       /* is flush in progress and are we at the flushing tip? */
+       if (likely(cwq->flush_color != color))
+               return;
+
+       /* are there still in-flight works? */
+       if (cwq->nr_in_flight[color])
+               return;
+
+       /* this cwq is done, clear flush_color */
+       cwq->flush_color = -1;
+
+       /*
+        * If this was the last cwq, wake up the first flusher.  It
+        * will handle the rest.
+        */
+       if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
+               complete(&cwq->wq->first_flusher->done);
+}
+
+/**
+ * try_to_grab_pending - steal work item from worklist and disable irq
+ * @work: work item to steal
+ * @is_dwork: @work is a delayed_work
+ * @flags: place to store irq state
+ *
+ * Try to grab PENDING bit of @work.  This function can handle @work in any
+ * stable state - idle, on timer or on worklist.  Return values are
+ *
+ *  1          if @work was pending and we successfully stole PENDING
+ *  0          if @work was idle and we claimed PENDING
+ *  -EAGAIN    if PENDING couldn't be grabbed at the moment, safe to busy-retry
+ *  -ENOENT    if someone else is canceling @work, this state may persist
+ *             for arbitrarily long
+ *
+ * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
+ * interrupted while holding PENDING and @work off queue, irq must be
+ * disabled on entry.  This, combined with delayed_work->timer being
+ * irqsafe, ensures that we return -EAGAIN for a finite short period of time.
+ *
+ * On successful return, >= 0, irq is disabled and the caller is
+ * responsible for releasing it using local_irq_restore(*@flags).
+ *
+ * This function is safe to call from any context including IRQ handler.
+ */
+static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
+                              unsigned long *flags)
+{
+       struct global_cwq *gcwq;
+
+       WARN_ON_ONCE(in_irq());
+
+       local_irq_save(*flags);
+
+       /* try to steal the timer if it exists */
+       if (is_dwork) {
+               struct delayed_work *dwork = to_delayed_work(work);
+
+               /*
+                * dwork->timer is irqsafe.  If del_timer() fails, it's
+                * guaranteed that the timer is not queued anywhere and not
+                * running on the local CPU.
+                */
+               if (likely(del_timer(&dwork->timer)))
+                       return 1;
        }
 
-       gcwq->flags |= GCWQ_HIGHPRI_PENDING;
-       return &twork->entry;
+       /* try to claim PENDING the normal way */
+       if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
+               return 0;
+
+       /*
+        * The queueing is in progress, or it is already queued. Try to
+        * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+        */
+       gcwq = get_work_gcwq(work);
+       if (!gcwq)
+               goto fail;
+
+       spin_lock(&gcwq->lock);
+       if (!list_empty(&work->entry)) {
+               /*
+                * This work is queued, but perhaps we locked the wrong gcwq.
+                * In that case we must see the new value after rmb(), see
+                * insert_work()->wmb().
+                */
+               smp_rmb();
+               if (gcwq == get_work_gcwq(work)) {
+                       debug_work_deactivate(work);
+                       list_del_init(&work->entry);
+                       cwq_dec_nr_in_flight(get_work_cwq(work),
+                               get_work_color(work),
+                               *work_data_bits(work) & WORK_STRUCT_DELAYED);
+
+                       spin_unlock(&gcwq->lock);
+                       return 1;
+               }
+       }
+       spin_unlock(&gcwq->lock);
+fail:
+       local_irq_restore(*flags);
+       if (work_is_canceling(work))
+               return -ENOENT;
+       cpu_relax();
+       return -EAGAIN;
 }
 
 /**
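
For reference, the return-value table above implies a specific caller-side
discipline: busy-retry on -EAGAIN, and on -ENOENT sleep rather than spin,
since the canceling state can persist.  A sketch modeled on the cancel
path; mod_delayed_work_on() further down uses the same loop minus the
-ENOENT wait.  Not a drop-in addition to this file:

static bool grab_pending_sketch(struct work_struct *work, bool is_dwork,
				unsigned long *flags)
{
	int ret;

	do {
		ret = try_to_grab_pending(work, is_dwork, flags);
		/*
		 * -ENOENT: someone else is canceling and PENDING may stay
		 * set for a long time.  irqs were restored on the error
		 * return, so it is safe to sleep in flush_work() here
		 * instead of burning CPU.
		 */
		if (unlikely(ret == -ENOENT))
			flush_work(work);
	} while (unlikely(ret < 0));

	/* irqs are now off and we own PENDING; 1: stolen, 0: was idle */
	return ret;
}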
@@ -920,7 +1136,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
                        struct work_struct *work, struct list_head *head,
                        unsigned int extra_flags)
 {
-       struct global_cwq *gcwq = cwq->gcwq;
+       struct worker_pool *pool = cwq->pool;
 
        /* we own @work, set data and link */
        set_work_cwq(work, cwq, extra_flags);
@@ -940,8 +1156,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
         */
        smp_mb();
 
-       if (__need_more_worker(gcwq))
-               wake_up_worker(gcwq);
+       if (__need_more_worker(pool))
+               wake_up_worker(pool);
 }
 
 /*
@@ -983,7 +1199,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
        struct cpu_workqueue_struct *cwq;
        struct list_head *worklist;
        unsigned int work_flags;
-       unsigned long flags;
+       unsigned int req_cpu = cpu;
+
+       /*
+        * While a work item is PENDING && off queue, a task trying to
+        * steal the PENDING will busy-loop waiting for it to either get
+        * queued or lose PENDING.  Grabbing PENDING and queueing should
+        * happen with IRQ disabled.
+        */
+       WARN_ON_ONCE(!irqs_disabled());
 
        debug_work_activate(work);
 
@@ -996,21 +1220,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
        if (!(wq->flags & WQ_UNBOUND)) {
                struct global_cwq *last_gcwq;
 
-               if (unlikely(cpu == WORK_CPU_UNBOUND))
+               if (cpu == WORK_CPU_UNBOUND)
                        cpu = raw_smp_processor_id();
 
                /*
-                * It's multi cpu.  If @wq is non-reentrant and @work
-                * was previously on a different cpu, it might still
-                * be running there, in which case the work needs to
-                * be queued on that cpu to guarantee non-reentrance.
+                * It's multi cpu.  If @work was previously on a different
+                * cpu, it might still be running there, in which case the
+                * work needs to be queued on that cpu to guarantee
+                * non-reentrancy.
                 */
                gcwq = get_gcwq(cpu);
-               if (wq->flags & WQ_NON_REENTRANT &&
-                   (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
+               last_gcwq = get_work_gcwq(work);
+
+               if (last_gcwq && last_gcwq != gcwq) {
                        struct worker *worker;
 
-                       spin_lock_irqsave(&last_gcwq->lock, flags);
+                       spin_lock(&last_gcwq->lock);
 
                        worker = find_worker_executing_work(last_gcwq, work);
 
@@ -1018,22 +1243,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
                                gcwq = last_gcwq;
                        else {
                                /* meh... not running there, queue here */
-                               spin_unlock_irqrestore(&last_gcwq->lock, flags);
-                               spin_lock_irqsave(&gcwq->lock, flags);
+                               spin_unlock(&last_gcwq->lock);
+                               spin_lock(&gcwq->lock);
                        }
-               } else
-                       spin_lock_irqsave(&gcwq->lock, flags);
+               } else {
+                       spin_lock(&gcwq->lock);
+               }
        } else {
                gcwq = get_gcwq(WORK_CPU_UNBOUND);
-               spin_lock_irqsave(&gcwq->lock, flags);
+               spin_lock(&gcwq->lock);
        }
 
        /* gcwq determined, get cwq and queue */
        cwq = get_cwq(gcwq->cpu, wq);
-       trace_workqueue_queue_work(cpu, cwq, work);
+       trace_workqueue_queue_work(req_cpu, cwq, work);
 
        if (WARN_ON(!list_empty(&work->entry))) {
-               spin_unlock_irqrestore(&gcwq->lock, flags);
+               spin_unlock(&gcwq->lock);
                return;
        }
 
@@ -1043,7 +1269,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
        if (likely(cwq->nr_active < cwq->max_active)) {
                trace_workqueue_activate_work(work);
                cwq->nr_active++;
-               worklist = gcwq_determine_ins_pos(gcwq, cwq);
+               worklist = &cwq->pool->worklist;
        } else {
                work_flags |= WORK_STRUCT_DELAYED;
                worklist = &cwq->delayed_works;
@@ -1051,29 +1277,8 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 
        insert_work(cwq, work, worklist, work_flags);
 
-       spin_unlock_irqrestore(&gcwq->lock, flags);
-}
-
-/**
- * queue_work - queue work on a workqueue
- * @wq: workqueue to use
- * @work: work to queue
- *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
- *
- * We queue the work to the CPU on which it was submitted, but if the CPU dies
- * it can be processed by another CPU.
- */
-int queue_work(struct workqueue_struct *wq, struct work_struct *work)
-{
-       int ret;
-
-       ret = queue_work_on(get_cpu(), wq, work);
-       put_cpu();
-
-       return ret;
+       spin_unlock(&gcwq->lock);
 }
-EXPORT_SYMBOL_GPL(queue_work);
 
 /**
  * queue_work_on - queue work on specific cpu
@@ -1081,105 +1286,204 @@ EXPORT_SYMBOL_GPL(queue_work);
  * @wq: workqueue to use
  * @work: work to queue
  *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise.
  *
  * We queue the work to a specific CPU, the caller must ensure it
  * can't go away.
  */
-int
-queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
+bool queue_work_on(int cpu, struct workqueue_struct *wq,
+                  struct work_struct *work)
 {
-       int ret = 0;
+       bool ret = false;
+       unsigned long flags;
+
+       local_irq_save(flags);
 
        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
                __queue_work(cpu, wq, work);
-               ret = 1;
+               ret = true;
        }
+
+       local_irq_restore(flags);
        return ret;
 }
 EXPORT_SYMBOL_GPL(queue_work_on);
 
-static void delayed_work_timer_fn(unsigned long __data)
-{
-       struct delayed_work *dwork = (struct delayed_work *)__data;
-       struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
-
-       __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
-}
-
 /**
- * queue_delayed_work - queue work on a workqueue after delay
+ * queue_work - queue work on a workqueue
  * @wq: workqueue to use
- * @dwork: delayable work to queue
- * @delay: number of jiffies to wait before queueing
+ * @work: work to queue
  *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
+ * Returns %false if @work was already on a queue, %true otherwise.
+ *
+ * We queue the work to the CPU on which it was submitted, but if the CPU dies
+ * it can be processed by another CPU.
  */
-int queue_delayed_work(struct workqueue_struct *wq,
-                       struct delayed_work *dwork, unsigned long delay)
+bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
+{
+       return queue_work_on(WORK_CPU_UNBOUND, wq, work);
+}
+EXPORT_SYMBOL_GPL(queue_work);
+
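
Caller-side view of the reworked bool-returning queueing API, as a minimal
module-style sketch; the my_* names are invented:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;
static struct work_struct my_work;

static void my_work_fn(struct work_struct *work)
{
	pr_info("my_work ran\n");
}

static int __init my_init(void)
{
	my_wq = alloc_workqueue("my_wq", 0, 0);
	if (!my_wq)
		return -ENOMEM;

	INIT_WORK(&my_work, my_work_fn);

	/* %true: newly queued; %false: it was already pending */
	if (!queue_work(my_wq, &my_work))
		pr_info("my_work was already queued\n");
	return 0;
}

static void __exit my_exit(void)
{
	flush_workqueue(my_wq);
	destroy_workqueue(my_wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");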
+void delayed_work_timer_fn(unsigned long __data)
 {
-       if (delay == 0)
-               return queue_work(wq, &dwork->work);
+       struct delayed_work *dwork = (struct delayed_work *)__data;
+       struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
 
-       return queue_delayed_work_on(-1, wq, dwork, delay);
+       /* should have been called from irqsafe timer with irq already off */
+       __queue_work(dwork->cpu, cwq->wq, &dwork->work);
 }
-EXPORT_SYMBOL_GPL(queue_delayed_work);
+EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
 
-/**
- * queue_delayed_work_on - queue work on specific CPU after delay
- * @cpu: CPU number to execute work on
- * @wq: workqueue to use
- * @dwork: work to queue
- * @delay: number of jiffies to wait before queueing
- *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
- */
-int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
-                       struct delayed_work *dwork, unsigned long delay)
+static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
+                               struct delayed_work *dwork, unsigned long delay)
 {
-       int ret = 0;
        struct timer_list *timer = &dwork->timer;
        struct work_struct *work = &dwork->work;
+       unsigned int lcpu;
 
-       if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
-               unsigned int lcpu;
-
-               BUG_ON(timer_pending(timer));
-               BUG_ON(!list_empty(&work->entry));
+       WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
+                    timer->data != (unsigned long)dwork);
+       BUG_ON(timer_pending(timer));
+       BUG_ON(!list_empty(&work->entry));
 
-               timer_stats_timer_set_start_info(&dwork->timer);
+       timer_stats_timer_set_start_info(&dwork->timer);
+
+       /*
+        * This stores cwq for the moment, for the timer_fn.  Note that the
+        * work's gcwq is preserved to allow reentrance detection for
+        * delayed works.
+        */
+       if (!(wq->flags & WQ_UNBOUND)) {
+               struct global_cwq *gcwq = get_work_gcwq(work);
 
                /*
-                * This stores cwq for the moment, for the timer_fn.
-                * Note that the work's gcwq is preserved to allow
-                * reentrance detection for delayed works.
+                * If we cannot get the last gcwq from @work directly,
+                * select the last CPU such that it avoids unnecessarily
+                * triggering non-reentrancy check in __queue_work().
                 */
-               if (!(wq->flags & WQ_UNBOUND)) {
-                       struct global_cwq *gcwq = get_work_gcwq(work);
+               lcpu = cpu;
+               if (gcwq)
+                       lcpu = gcwq->cpu;
+               if (lcpu == WORK_CPU_UNBOUND)
+                       lcpu = raw_smp_processor_id();
+       } else {
+               lcpu = WORK_CPU_UNBOUND;
+       }
 
-                       if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
-                               lcpu = gcwq->cpu;
-                       else
-                               lcpu = raw_smp_processor_id();
-               } else
-                       lcpu = WORK_CPU_UNBOUND;
+       set_work_cwq(work, get_cwq(lcpu, wq), 0);
 
-               set_work_cwq(work, get_cwq(lcpu, wq), 0);
+       dwork->cpu = cpu;
+       timer->expires = jiffies + delay;
 
-               timer->expires = jiffies + delay;
-               timer->data = (unsigned long)dwork;
-               timer->function = delayed_work_timer_fn;
+       if (unlikely(cpu != WORK_CPU_UNBOUND))
+               add_timer_on(timer, cpu);
+       else
+               add_timer(timer);
+}
 
-               if (unlikely(cpu >= 0))
-                       add_timer_on(timer, cpu);
-               else
-                       add_timer(timer);
-               ret = 1;
+/**
+ * queue_delayed_work_on - queue work on specific CPU after delay
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns %false if @work was already on a queue, %true otherwise.  If
+ * @delay is zero and @dwork is idle, it will be scheduled for immediate
+ * execution.
+ */
+bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+                          struct delayed_work *dwork, unsigned long delay)
+{
+       struct work_struct *work = &dwork->work;
+       bool ret = false;
+       unsigned long flags;
+
+       if (!delay)
+               return queue_work_on(cpu, wq, &dwork->work);
+
+       /* read the comment in __queue_work() */
+       local_irq_save(flags);
+
+       if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+               __queue_delayed_work(cpu, wq, dwork, delay);
+               ret = true;
        }
+
+       local_irq_restore(flags);
        return ret;
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
+/**
+ * queue_delayed_work - queue work on a workqueue after delay
+ * @wq: workqueue to use
+ * @dwork: delayable work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
+ */
+bool queue_delayed_work(struct workqueue_struct *wq,
+                       struct delayed_work *dwork, unsigned long delay)
+{
+       return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
+}
+EXPORT_SYMBOL_GPL(queue_delayed_work);
+
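
One consequence of the WARN_ON_ONCE() in __queue_delayed_work() above:
dwork->timer must already point at delayed_work_timer_fn when the work is
queued, so delayed works have to come from INIT_DELAYED_WORK() and friends
rather than an open-coded timer.  A minimal sketch, names invented:

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

static struct delayed_work my_dwork;

static void my_dwork_fn(struct work_struct *work)
{
	pr_info("delayed work fired\n");
}

static void my_setup(void)
{
	/*
	 * INIT_DELAYED_WORK() wires dwork->timer to delayed_work_timer_fn
	 * up front; an open-coded timer would trip the WARN_ON_ONCE()
	 * in __queue_delayed_work() above.
	 */
	INIT_DELAYED_WORK(&my_dwork, my_dwork_fn);

	/* queue on the local CPU's gcwq roughly two seconds from now */
	queue_delayed_work(system_wq, &my_dwork, 2 * HZ);
}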
+/**
+ * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
+ * modify @dwork's timer so that it expires after @delay.  If @delay is
+ * zero, @work is guaranteed to be scheduled immediately regardless of its
+ * current state.
+ *
+ * Returns %false if @dwork was idle and queued, %true if @dwork was
+ * pending and its timer was modified.
+ *
+ * This function is safe to call from any context including IRQ handler.
+ * See try_to_grab_pending() for details.
+ */
+bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
+                        struct delayed_work *dwork, unsigned long delay)
+{
+       unsigned long flags;
+       int ret;
+
+       do {
+               ret = try_to_grab_pending(&dwork->work, true, &flags);
+       } while (unlikely(ret == -EAGAIN));
+
+       if (likely(ret >= 0)) {
+               __queue_delayed_work(cpu, wq, dwork, delay);
+               local_irq_restore(flags);
+       }
+
+       /* -ENOENT from try_to_grab_pending() becomes %true */
+       return ret;
+}
+EXPORT_SYMBOL_GPL(mod_delayed_work_on);
+
+/**
+ * mod_delayed_work - modify delay of or queue a delayed work
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * mod_delayed_work_on() on local CPU.
+ */
+bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
+                     unsigned long delay)
+{
+       return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
+}
+EXPORT_SYMBOL_GPL(mod_delayed_work);
+
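
Because mod_delayed_work() is safe from any context including IRQ
handlers, the natural use is debouncing: every event pushes the deadline
back out and the handler runs only after things go quiet.  A sketch, names
invented:

#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static struct delayed_work debounce_work;

/* each interrupt pushes execution out to 100ms after the last event */
static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	mod_delayed_work(system_wq, &debounce_work,
			 msecs_to_jiffies(100));
	return IRQ_HANDLED;
}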
 /**
  * worker_enter_idle - enter idle state
  * @worker: worker which is entering idle state
@@ -1192,7 +1496,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
  */
 static void worker_enter_idle(struct worker *worker)
 {
-       struct global_cwq *gcwq = worker->gcwq;
+       struct worker_pool *pool = worker->pool;
+       struct global_cwq *gcwq = pool->gcwq;
 
        BUG_ON(worker->flags & WORKER_IDLE);
        BUG_ON(!list_empty(&worker->entry) &&
@@ -1200,27 +1505,24 @@ static void worker_enter_idle(struct worker *worker)
 
        /* can't use worker_set_flags(), also called from start_worker() */
        worker->flags |= WORKER_IDLE;
-       gcwq->nr_idle++;
+       pool->nr_idle++;
        worker->last_active = jiffies;
 
        /* idle_list is LIFO */
-       list_add(&worker->entry, &gcwq->idle_list);
+       list_add(&worker->entry, &pool->idle_list);
 
-       if (likely(!(worker->flags & WORKER_ROGUE))) {
-               if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
-                       mod_timer(&gcwq->idle_timer,
-                                 jiffies + IDLE_WORKER_TIMEOUT);
-       } else
-               wake_up_all(&gcwq->trustee_wait);
+       if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+               mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
        /*
-        * Sanity check nr_running.  Because trustee releases gcwq->lock
-        * between setting %WORKER_ROGUE and zapping nr_running, the
-        * warning may trigger spuriously.  Check iff trustee is idle.
+        * Sanity check nr_running.  Because gcwq_unbind_fn() releases
+        * gcwq->lock between setting %WORKER_UNBOUND and zapping
+        * nr_running, the warning may trigger spuriously.  Check iff
+        * unbind is not in progress.
         */
-       WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
-                    gcwq->nr_workers == gcwq->nr_idle &&
-                    atomic_read(get_gcwq_nr_running(gcwq->cpu)));
+       WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
+                    pool->nr_workers == pool->nr_idle &&
+                    atomic_read(get_pool_nr_running(pool)));
 }
 
 /**
@@ -1234,11 +1536,11 @@ static void worker_enter_idle(struct worker *worker)
  */
 static void worker_leave_idle(struct worker *worker)
 {
-       struct global_cwq *gcwq = worker->gcwq;
+       struct worker_pool *pool = worker->pool;
 
        BUG_ON(!(worker->flags & WORKER_IDLE));
        worker_clr_flags(worker, WORKER_IDLE);
-       gcwq->nr_idle--;
+       pool->nr_idle--;
        list_del_init(&worker->entry);
 }
 
@@ -1258,11 +1560,11 @@ static void worker_leave_idle(struct worker *worker)
  * verbatim as it's best effort and blocking and gcwq may be
  * [dis]associated in the meantime.
  *
- * This function tries set_cpus_allowed() and locks gcwq and verifies
- * the binding against GCWQ_DISASSOCIATED which is set during
- * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
- * idle state or fetches works without dropping lock, it can guarantee
- * the scheduling requirement described in the first paragraph.
+ * This function tries set_cpus_allowed() and locks gcwq and verifies the
+ * binding against %GCWQ_DISASSOCIATED which is set during
+ * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
+ * enters idle state or fetches works without dropping lock, it can
+ * guarantee the scheduling requirement described in the first paragraph.
  *
  * CONTEXT:
  * Might sleep.  Called without any lock but returns with gcwq->lock
@@ -1275,7 +1577,7 @@ static void worker_leave_idle(struct worker *worker)
 static bool worker_maybe_bind_and_lock(struct worker *worker)
 __acquires(&gcwq->lock)
 {
-       struct global_cwq *gcwq = worker->gcwq;
+       struct global_cwq *gcwq = worker->pool->gcwq;
        struct task_struct *task = worker->task;
 
        while (true) {
@@ -1308,16 +1610,40 @@ __acquires(&gcwq->lock)
        }
 }
 
+struct idle_rebind {
+       int                     cnt;            /* # workers to be rebound */
+       struct completion       done;           /* all workers rebound */
+};
+
+/*
+ * Rebind an idle @worker to its CPU.  During CPU onlining, this has to
+ * happen synchronously for idle workers.  worker_thread() will test
+ * %WORKER_REBIND before leaving idle and call this function.
+ */
+static void idle_worker_rebind(struct worker *worker)
+{
+       struct global_cwq *gcwq = worker->pool->gcwq;
+
+       /* CPU must be online at this point */
+       WARN_ON(!worker_maybe_bind_and_lock(worker));
+       if (!--worker->idle_rebind->cnt)
+               complete(&worker->idle_rebind->done);
+       spin_unlock_irq(&worker->pool->gcwq->lock);
+
+       /* we did our part, wait for rebind_workers() to finish up */
+       wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+}
+
 /*
- * Function for worker->rebind_work used to rebind rogue busy workers
- * to the associated cpu which is coming back online.  This is
- * scheduled by cpu up but can race with other cpu hotplug operations
- * and may be executed twice without intervening cpu down.
+ * Function for @worker->rebind_work used to rebind unbound busy workers to
+ * the associated cpu which is coming back online.  This is scheduled by
+ * cpu up but can race with other cpu hotplug operations and may be
+ * executed twice without intervening cpu down.
  */
-static void worker_rebind_fn(struct work_struct *work)
+static void busy_worker_rebind_fn(struct work_struct *work)
 {
        struct worker *worker = container_of(work, struct worker, rebind_work);
-       struct global_cwq *gcwq = worker->gcwq;
+       struct global_cwq *gcwq = worker->pool->gcwq;
 
        if (worker_maybe_bind_and_lock(worker))
                worker_clr_flags(worker, WORKER_REBIND);
@@ -1325,6 +1651,122 @@ static void worker_rebind_fn(struct work_struct *work)
        spin_unlock_irq(&gcwq->lock);
 }
 
+/**
+ * rebind_workers - rebind all workers of a gcwq to the associated CPU
+ * @gcwq: gcwq of interest
+ *
+ * @gcwq->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
+ * is different for idle and busy ones.
+ *
+ * The idle ones should be rebound synchronously and idle rebinding should
+ * be complete before any worker starts executing work items with
+ * concurrency management enabled; otherwise, scheduler may oops trying to
+ * wake up non-local idle worker from wq_worker_sleeping().
+ *
+ * This is achieved by repeatedly requesting rebinding until all idle
+ * workers are known to have been rebound under @gcwq->lock and holding all
+ * idle workers from becoming busy until idle rebinding is complete.
+ *
+ * Once idle workers are rebound, busy workers can be rebound as they
+ * finish executing their current work items.  Queueing the rebind work at
+ * the head of their scheduled lists is enough.  Note that nr_running will
+ * be properly bumped as busy workers rebind.
+ *
+ * On return, all workers are guaranteed to either be bound or have rebind
+ * work item scheduled.
+ */
+static void rebind_workers(struct global_cwq *gcwq)
+       __releases(&gcwq->lock) __acquires(&gcwq->lock)
+{
+       struct idle_rebind idle_rebind;
+       struct worker_pool *pool;
+       struct worker *worker;
+       struct hlist_node *pos;
+       int i;
+
+       lockdep_assert_held(&gcwq->lock);
+
+       for_each_worker_pool(pool, gcwq)
+               lockdep_assert_held(&pool->manager_mutex);
+
+       /*
+        * Rebind idle workers.  Interlocked both ways.  We wait for
+        * workers to rebind via @idle_rebind.done.  Workers will wait for
+        * us to finish up by watching %WORKER_REBIND.
+        */
+       init_completion(&idle_rebind.done);
+retry:
+       idle_rebind.cnt = 1;
+       INIT_COMPLETION(idle_rebind.done);
+
+       /* set REBIND and kick idle ones, we'll wait for these later */
+       for_each_worker_pool(pool, gcwq) {
+               list_for_each_entry(worker, &pool->idle_list, entry) {
+                       if (worker->flags & WORKER_REBIND)
+                               continue;
+
+                       /* morph UNBOUND to REBIND */
+                       worker->flags &= ~WORKER_UNBOUND;
+                       worker->flags |= WORKER_REBIND;
+
+                       idle_rebind.cnt++;
+                       worker->idle_rebind = &idle_rebind;
+
+                       /* worker_thread() will call idle_worker_rebind() */
+                       wake_up_process(worker->task);
+               }
+       }
+
+       if (--idle_rebind.cnt) {
+               spin_unlock_irq(&gcwq->lock);
+               wait_for_completion(&idle_rebind.done);
+               spin_lock_irq(&gcwq->lock);
+               /* busy ones might have become idle while waiting, retry */
+               goto retry;
+       }
+
+       /*
+        * All idle workers are rebound and waiting for %WORKER_REBIND to
+        * be cleared inside idle_worker_rebind().  Clear and release.
+        * Clearing %WORKER_REBIND from this foreign context is safe
+        * because these workers are still guaranteed to be idle.
+        */
+       for_each_worker_pool(pool, gcwq)
+               list_for_each_entry(worker, &pool->idle_list, entry)
+                       worker->flags &= ~WORKER_REBIND;
+
+       wake_up_all(&gcwq->rebind_hold);
+
+       /* rebind busy workers */
+       for_each_busy_worker(worker, i, pos, gcwq) {
+               struct work_struct *rebind_work = &worker->rebind_work;
+               struct workqueue_struct *wq;
+
+               /* morph UNBOUND to REBIND */
+               worker->flags &= ~WORKER_UNBOUND;
+               worker->flags |= WORKER_REBIND;
+
+               if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+                                    work_data_bits(rebind_work)))
+                       continue;
+
+               debug_work_activate(rebind_work);
+
+               /*
+                * wq doesn't really matter but let's keep @worker->pool
+                * and @cwq->pool consistent for sanity.
+                */
+               if (worker_pool_pri(worker->pool))
+                       wq = system_highpri_wq;
+               else
+                       wq = system_wq;
+
+               insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
+                       worker->scheduled.next,
+                       work_color_to_flags(WORK_NO_COLOR));
+       }
+}
+
 static struct worker *alloc_worker(void)
 {
        struct worker *worker;
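
The idle_rebind machinery above is a counted-completion rendezvous: the
coordinator seeds cnt with one reference for itself, every participant
decrements, and whoever reaches zero fires the completion.  A
stripped-down sketch of just that pattern, not the workqueue code itself;
the lock parameter stands in for gcwq->lock:

#include <linux/completion.h>
#include <linux/spinlock.h>

struct rendezvous {			/* cf. struct idle_rebind */
	int			cnt;	/* participants + coordinator's ref */
	struct completion	done;
};

/* each participant checks in under @lock; the last one fires @done */
static void participant_arrive(struct rendezvous *r)
{
	if (!--r->cnt)
		complete(&r->done);
}

static void coordinator_wait(struct rendezvous *r, spinlock_t *lock,
			     int nr_participants)
{
	spin_lock_irq(lock);
	init_completion(&r->done);
	r->cnt = nr_participants + 1;	/* +1 keeps @done from firing early */
	/* ... wake the participants here, as rebind_workers() does ... */
	if (--r->cnt) {			/* drop our own reference */
		spin_unlock_irq(lock);
		wait_for_completion(&r->done);
		spin_lock_irq(lock);
	}
	spin_unlock_irq(lock);
}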
@@ -1333,7 +1775,7 @@ static struct worker *alloc_worker(void)
        if (worker) {
                INIT_LIST_HEAD(&worker->entry);
                INIT_LIST_HEAD(&worker->scheduled);
-               INIT_WORK(&worker->rebind_work, worker_rebind_fn);
+               INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
                /* on creation a worker is in !idle && prep state */
                worker->flags = WORKER_PREP;
        }
@@ -1342,10 +1784,9 @@ static struct worker *alloc_worker(void)
 
 /**
  * create_worker - create a new workqueue worker
- * @gcwq: gcwq the new worker will belong to
- * @bind: whether to set affinity to @cpu or not
+ * @pool: pool the new worker will belong to
  *
- * Create a new worker which is bound to @gcwq.  The returned worker
+ * Create a new worker which is bound to @pool.  The returned worker
  * can be started by calling start_worker() or destroyed using
  * destroy_worker().
  *
@@ -1355,16 +1796,17 @@ static struct worker *alloc_worker(void)
  * RETURNS:
  * Pointer to the newly created worker.
  */
-static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
+static struct worker *create_worker(struct worker_pool *pool)
 {
-       bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
+       struct global_cwq *gcwq = pool->gcwq;
+       const char *pri = worker_pool_pri(pool) ? "H" : "";
        struct worker *worker = NULL;
        int id = -1;
 
        spin_lock_irq(&gcwq->lock);
-       while (ida_get_new(&gcwq->worker_ida, &id)) {
+       while (ida_get_new(&pool->worker_ida, &id)) {
                spin_unlock_irq(&gcwq->lock);
-               if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
+               if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
                        goto fail;
                spin_lock_irq(&gcwq->lock);
        }
@@ -1374,38 +1816,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
        if (!worker)
                goto fail;
 
-       worker->gcwq = gcwq;
+       worker->pool = pool;
        worker->id = id;
 
-       if (!on_unbound_cpu)
+       if (gcwq->cpu != WORK_CPU_UNBOUND)
                worker->task = kthread_create_on_node(worker_thread,
-                                                     worker,
-                                                     cpu_to_node(gcwq->cpu),
-                                                     "kworker/%u:%d", gcwq->cpu, id);
+                                       worker, cpu_to_node(gcwq->cpu),
+                                       "kworker/%u:%d%s", gcwq->cpu, id, pri);
        else
                worker->task = kthread_create(worker_thread, worker,
-                                             "kworker/u:%d", id);
+                                             "kworker/u:%d%s", id, pri);
        if (IS_ERR(worker->task))
                goto fail;
 
+       if (worker_pool_pri(pool))
+               set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
+
        /*
-        * A rogue worker will become a regular one if CPU comes
-        * online later on.  Make sure every worker has
-        * PF_THREAD_BOUND set.
+        * Determine CPU binding of the new worker depending on
+        * %GCWQ_DISASSOCIATED.  The caller is responsible for ensuring the
+        * flag remains stable across this function.  See the comments
+        * above the flag definition for details.
+        *
+        * As an unbound worker may later become a regular one if CPU comes
+        * online, make sure every worker has %PF_THREAD_BOUND set.
         */
-       if (bind && !on_unbound_cpu)
+       if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
                kthread_bind(worker->task, gcwq->cpu);
-       else {
+       } else {
                worker->task->flags |= PF_THREAD_BOUND;
-               if (on_unbound_cpu)
-                       worker->flags |= WORKER_UNBOUND;
+               worker->flags |= WORKER_UNBOUND;
        }
 
        return worker;
 fail:
        if (id >= 0) {
                spin_lock_irq(&gcwq->lock);
-               ida_remove(&gcwq->worker_ida, id);
+               ida_remove(&pool->worker_ida, id);
                spin_unlock_irq(&gcwq->lock);
        }
        kfree(worker);
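The "H" name suffix and HIGHPRI_NICE_LEVEL above key off worker_pool_pri(); assuming the pools sit in an array gcwq->pools[NR_WORKER_POOLS] with the highpri pool at index 1, a minimal definition is:

/* pool index in gcwq->pools doubles as its priority: 0 normal, 1 highpri */
static int worker_pool_pri(struct worker_pool *pool)
{
        return pool - pool->gcwq->pools;
}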
@@ -1424,7 +1871,7 @@ fail:
 static void start_worker(struct worker *worker)
 {
        worker->flags |= WORKER_STARTED;
-       worker->gcwq->nr_workers++;
+       worker->pool->nr_workers++;
        worker_enter_idle(worker);
        wake_up_process(worker->task);
 }
@@ -1440,7 +1887,8 @@ static void start_worker(struct worker *worker)
  */
 static void destroy_worker(struct worker *worker)
 {
-       struct global_cwq *gcwq = worker->gcwq;
+       struct worker_pool *pool = worker->pool;
+       struct global_cwq *gcwq = pool->gcwq;
        int id = worker->id;
 
        /* sanity check frenzy */
@@ -1448,9 +1896,9 @@ static void destroy_worker(struct worker *worker)
        BUG_ON(!list_empty(&worker->scheduled));
 
        if (worker->flags & WORKER_STARTED)
-               gcwq->nr_workers--;
+               pool->nr_workers--;
        if (worker->flags & WORKER_IDLE)
-               gcwq->nr_idle--;
+               pool->nr_idle--;
 
        list_del_init(&worker->entry);
        worker->flags |= WORKER_DIE;
@@ -1461,29 +1909,30 @@ static void destroy_worker(struct worker *worker)
        kfree(worker);
 
        spin_lock_irq(&gcwq->lock);
-       ida_remove(&gcwq->worker_ida, id);
+       ida_remove(&pool->worker_ida, id);
 }
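destroy_worker()'s nr_workers/nr_idle bookkeeping feeds too_many_workers(), which the idle timer below consults; after the pool conversion it presumably reads as follows (a sketch, assuming a straight s/gcwq/pool/ of the old check):

static bool too_many_workers(struct worker_pool *pool)
{
        bool managing = mutex_is_locked(&pool->manager_mutex);
        int nr_idle = pool->nr_idle + managing; /* manager counts as idle */
        int nr_busy = pool->nr_workers - nr_idle;

        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}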
 
-static void idle_worker_timeout(unsigned long __gcwq)
+static void idle_worker_timeout(unsigned long __pool)
 {
-       struct global_cwq *gcwq = (void *)__gcwq;
+       struct worker_pool *pool = (void *)__pool;
+       struct global_cwq *gcwq = pool->gcwq;
 
        spin_lock_irq(&gcwq->lock);
 
-       if (too_many_workers(gcwq)) {
+       if (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;
 
                /* idle_list is kept in LIFO order, check the last one */
-               worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+               worker = list_entry(pool->idle_list.prev, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
                if (time_before(jiffies, expires))
-                       mod_timer(&gcwq->idle_timer, expires);
+                       mod_timer(&pool->idle_timer, expires);
                else {
                        /* it's been idle for too long, wake up manager */
-                       gcwq->flags |= GCWQ_MANAGE_WORKERS;
-                       wake_up_worker(gcwq);
+                       pool->flags |= POOL_MANAGE_WORKERS;
+                       wake_up_worker(pool);
                }
        }
 
@@ -1500,7 +1949,7 @@ static bool send_mayday(struct work_struct *work)
                return false;
 
        /* mayday mayday mayday */
-       cpu = cwq->gcwq->cpu;
+       cpu = cwq->pool->gcwq->cpu;
        /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
        if (cpu == WORK_CPU_UNBOUND)
                cpu = 0;
@@ -1509,37 +1958,38 @@ static bool send_mayday(struct work_struct *work)
        return true;
 }
 
-static void gcwq_mayday_timeout(unsigned long __gcwq)
+static void gcwq_mayday_timeout(unsigned long __pool)
 {
-       struct global_cwq *gcwq = (void *)__gcwq;
+       struct worker_pool *pool = (void *)__pool;
+       struct global_cwq *gcwq = pool->gcwq;
        struct work_struct *work;
 
        spin_lock_irq(&gcwq->lock);
 
-       if (need_to_create_worker(gcwq)) {
+       if (need_to_create_worker(pool)) {
                /*
                 * We've been trying to create a new worker but
                 * haven't been successful.  We might be hitting an
                 * allocation deadlock.  Send distress signals to
                 * rescuers.
                 */
-               list_for_each_entry(work, &gcwq->worklist, entry)
+               list_for_each_entry(work, &pool->worklist, entry)
                        send_mayday(work);
        }
 
        spin_unlock_irq(&gcwq->lock);
 
-       mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
+       mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
 
 /**
  * maybe_create_worker - create a new worker if necessary
- * @gcwq: gcwq to create a new worker for
+ * @pool: pool to create a new worker for
  *
- * Create a new worker for @gcwq if necessary.  @gcwq is guaranteed to
+ * Create a new worker for @pool if necessary.  @pool is guaranteed to
  * have at least one idle worker on return from this function.  If
  * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
- * sent to all rescuers with works scheduled on @gcwq to resolve
+ * sent to all rescuers with works scheduled on @pool to resolve
  * possible allocation deadlock.
  *
  * On return, need_to_create_worker() is guaranteed to be false and
@@ -1554,52 +2004,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
  * false if no action was taken and gcwq->lock stayed locked, true
  * otherwise.
  */
-static bool maybe_create_worker(struct global_cwq *gcwq)
+static bool maybe_create_worker(struct worker_pool *pool)
 __releases(&gcwq->lock)
 __acquires(&gcwq->lock)
 {
-       if (!need_to_create_worker(gcwq))
+       struct global_cwq *gcwq = pool->gcwq;
+
+       if (!need_to_create_worker(pool))
                return false;
 restart:
        spin_unlock_irq(&gcwq->lock);
 
        /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
-       mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+       mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
 
        while (true) {
                struct worker *worker;
 
-               worker = create_worker(gcwq, true);
+               worker = create_worker(pool);
                if (worker) {
-                       del_timer_sync(&gcwq->mayday_timer);
+                       del_timer_sync(&pool->mayday_timer);
                        spin_lock_irq(&gcwq->lock);
                        start_worker(worker);
-                       BUG_ON(need_to_create_worker(gcwq));
+                       BUG_ON(need_to_create_worker(pool));
                        return true;
                }
 
-               if (!need_to_create_worker(gcwq))
+               if (!need_to_create_worker(pool))
                        break;
 
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(CREATE_COOLDOWN);
 
-               if (!need_to_create_worker(gcwq))
+               if (!need_to_create_worker(pool))
                        break;
        }
 
-       del_timer_sync(&gcwq->mayday_timer);
+       del_timer_sync(&pool->mayday_timer);
        spin_lock_irq(&gcwq->lock);
-       if (need_to_create_worker(gcwq))
+       if (need_to_create_worker(pool))
                goto restart;
        return true;
 }
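The restart loop above terminates on need_to_create_worker(), presumably the composition of the two pool checks already used in worker_thread() (sketch):

/* with gcwq->lock held: need a worker if work is pending but none is idle */
static bool need_to_create_worker(struct worker_pool *pool)
{
        return need_more_worker(pool) && !may_start_working(pool);
}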
 
 /**
  * maybe_destroy_worker - destroy workers which have been idle for a while
- * @gcwq: gcwq to destroy workers for
+ * @pool: pool to destroy workers for
  *
- * Destroy @gcwq workers which have been idle for longer than
+ * Destroy @pool workers which have been idle for longer than
  * IDLE_WORKER_TIMEOUT.
  *
  * LOCKING:
@@ -1610,19 +2062,19 @@ restart:
  * false if no action was taken and gcwq->lock stayed locked, true
  * otherwise.
  */
-static bool maybe_destroy_workers(struct global_cwq *gcwq)
+static bool maybe_destroy_workers(struct worker_pool *pool)
 {
        bool ret = false;
 
-       while (too_many_workers(gcwq)) {
+       while (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;
 
-               worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+               worker = list_entry(pool->idle_list.prev, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
 
                if (time_before(jiffies, expires)) {
-                       mod_timer(&gcwq->idle_timer, expires);
+                       mod_timer(&pool->idle_timer, expires);
                        break;
                }
 
@@ -1655,136 +2107,25 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
  */
 static bool manage_workers(struct worker *worker)
 {
-       struct global_cwq *gcwq = worker->gcwq;
+       struct worker_pool *pool = worker->pool;
        bool ret = false;
 
-       if (gcwq->flags & GCWQ_MANAGING_WORKERS)
+       if (!mutex_trylock(&pool->manager_mutex))
                return ret;
 
-       gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
-       gcwq->flags |= GCWQ_MANAGING_WORKERS;
+       pool->flags &= ~POOL_MANAGE_WORKERS;
 
        /*
         * Destroy and then create so that may_start_working() is true
         * on return.
         */
-       ret |= maybe_destroy_workers(gcwq);
-       ret |= maybe_create_worker(gcwq);
-
-       gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
-
-       /*
-        * The trustee might be waiting to take over the manager
-        * position, tell it we're done.
-        */
-       if (unlikely(gcwq->trustee))
-               wake_up_all(&gcwq->trustee_wait);
+       ret |= maybe_destroy_workers(pool);
+       ret |= maybe_create_worker(pool);
 
+       mutex_unlock(&pool->manager_mutex);
        return ret;
 }
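worker_thread() reaches manage_workers() via need_to_manage_workers(), which presumably also honors the POOL_MANAGE_WORKERS flag set by the idle timer (sketch):

static bool need_to_manage_workers(struct worker_pool *pool)
{
        return need_to_create_worker(pool) ||
               (pool->flags & POOL_MANAGE_WORKERS);
}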
 
-/**
- * move_linked_works - move linked works to a list
- * @work: start of series of works to be scheduled
- * @head: target list to append @work to
- * @nextp: out paramter for nested worklist walking
- *
- * Schedule linked works starting from @work to @head.  Work series to
- * be scheduled starts at @work and includes any consecutive work with
- * WORK_STRUCT_LINKED set in its predecessor.
- *
- * If @nextp is not NULL, it's updated to point to the next work of
- * the last scheduled work.  This allows move_linked_works() to be
- * nested inside outer list_for_each_entry_safe().
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- */
-static void move_linked_works(struct work_struct *work, struct list_head *head,
-                             struct work_struct **nextp)
-{
-       struct work_struct *n;
-
-       /*
-        * Linked worklist will always end before the end of the list,
-        * use NULL for list head.
-        */
-       list_for_each_entry_safe_from(work, n, NULL, entry) {
-               list_move_tail(&work->entry, head);
-               if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
-                       break;
-       }
-
-       /*
-        * If we're already inside safe list traversal and have moved
-        * multiple works to the scheduled queue, the next position
-        * needs to be updated.
-        */
-       if (nextp)
-               *nextp = n;
-}
-
-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
-{
-       struct work_struct *work = list_first_entry(&cwq->delayed_works,
-                                                   struct work_struct, entry);
-       struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
-
-       trace_workqueue_activate_work(work);
-       move_linked_works(work, pos, NULL);
-       __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
-       cwq->nr_active++;
-}
-
-/**
- * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
- * @cwq: cwq of interest
- * @color: color of work which left the queue
- * @delayed: for a delayed work
- *
- * A work either has completed or is removed from pending queue,
- * decrement nr_in_flight of its cwq and handle workqueue flushing.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- */
-static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
-                                bool delayed)
-{
-       /* ignore uncolored works */
-       if (color == WORK_NO_COLOR)
-               return;
-
-       cwq->nr_in_flight[color]--;
-
-       if (!delayed) {
-               cwq->nr_active--;
-               if (!list_empty(&cwq->delayed_works)) {
-                       /* one down, submit a delayed one */
-                       if (cwq->nr_active < cwq->max_active)
-                               cwq_activate_first_delayed(cwq);
-               }
-       }
-
-       /* is flush in progress and are we at the flushing tip? */
-       if (likely(cwq->flush_color != color))
-               return;
-
-       /* are there still in-flight works? */
-       if (cwq->nr_in_flight[color])
-               return;
-
-       /* this cwq is done, clear flush_color */
-       cwq->flush_color = -1;
-
-       /*
-        * If this was the last cwq, wake up the first flusher.  It
-        * will handle the rest.
-        */
-       if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
-               complete(&cwq->wq->first_flusher->done);
-}
-
 /**
  * process_one_work - process single work
  * @worker: self
@@ -1804,7 +2145,8 @@ __releases(&gcwq->lock)
 __acquires(&gcwq->lock)
 {
        struct cpu_workqueue_struct *cwq = get_work_cwq(work);
-       struct global_cwq *gcwq = cwq->gcwq;
+       struct worker_pool *pool = worker->pool;
+       struct global_cwq *gcwq = pool->gcwq;
        struct hlist_head *bwh = busy_worker_head(gcwq, work);
        bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
        work_func_t f = work->func;
@@ -1822,6 +2164,15 @@ __acquires(&gcwq->lock)
 
        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
 #endif
+       /*
+        * Ensure we're on the correct CPU.  DISASSOCIATED test is
+        * necessary to avoid spurious warnings from rescuers servicing the
+        * unbound or a disassociated gcwq.
+        */
+       WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
+                    !(gcwq->flags & GCWQ_DISASSOCIATED) &&
+                    raw_smp_processor_id() != gcwq->cpu);
+
        /*
         * A single work shouldn't be executed concurrently by
         * multiple workers on a single cpu.  Check whether anyone is
@@ -1834,32 +2185,15 @@ __acquires(&gcwq->lock)
                return;
        }
 
-       /* claim and process */
+       /* claim and dequeue */
        debug_work_deactivate(work);
        hlist_add_head(&worker->hentry, bwh);
        worker->current_work = work;
        worker->current_cwq = cwq;
        work_color = get_work_color(work);
 
-       /* record the current cpu number in the work data and dequeue */
-       set_work_cpu(work, gcwq->cpu);
        list_del_init(&work->entry);
 
-       /*
-        * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
-        * wake up another worker; otherwise, clear HIGHPRI_PENDING.
-        */
-       if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
-               struct work_struct *nwork = list_first_entry(&gcwq->worklist,
-                                               struct work_struct, entry);
-
-               if (!list_empty(&gcwq->worklist) &&
-                   get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
-                       wake_up_worker(gcwq);
-               else
-                       gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
-       }
-
        /*
         * CPU intensive works don't participate in concurrency
         * management.  They're the scheduler's responsibility.
@@ -1867,9 +2201,23 @@ __acquires(&gcwq->lock)
        if (unlikely(cpu_intensive))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
 
+       /*
+        * Unbound gcwq isn't concurrency managed and work items should be
+        * executed ASAP.  Wake up another worker if necessary.
+        */
+       if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+               wake_up_worker(pool);
+
+       /*
+        * Record the last CPU and clear PENDING which should be the last
+        * update to @work.  Also, do this inside @gcwq->lock so that
+        * PENDING and queued state changes happen together while IRQ is
+        * disabled.
+        */
+       set_work_cpu_and_clear_pending(work, gcwq->cpu);
+
        spin_unlock_irq(&gcwq->lock);
 
-       work_clear_pending(work);
        lock_map_acquire_read(&cwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        trace_workqueue_execute_start(work);
@@ -1883,11 +2231,9 @@ __acquires(&gcwq->lock)
        lock_map_release(&cwq->wq->lockdep_map);
 
        if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
-               printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
-                      "%s/0x%08x/%d\n",
-                      current->comm, preempt_count(), task_pid_nr(current));
-               printk(KERN_ERR "    last function: ");
-               print_symbol("%s\n", (unsigned long)f);
+               pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
+                      "     last function: %pf\n",
+                      current->comm, preempt_count(), task_pid_nr(current), f);
                debug_show_held_locks(current);
                dump_stack();
        }
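The new helper above plausibly folds both updates into a single store to the work data word; a sketch, relying on gcwq->lock (as the comment notes) rather than explicit barriers for ordering:

static void set_work_cpu_and_clear_pending(struct work_struct *work,
                                           unsigned int cpu)
{
        /* record the CPU in the off-queue bits, drop PENDING in one store */
        set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
}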
@@ -1939,28 +2285,38 @@ static void process_scheduled_works(struct worker *worker)
 static int worker_thread(void *__worker)
 {
        struct worker *worker = __worker;
-       struct global_cwq *gcwq = worker->gcwq;
+       struct worker_pool *pool = worker->pool;
+       struct global_cwq *gcwq = pool->gcwq;
 
        /* tell the scheduler that this is a workqueue worker */
        worker->task->flags |= PF_WQ_WORKER;
 woke_up:
        spin_lock_irq(&gcwq->lock);
 
-       /* DIE can be set only while we're idle, checking here is enough */
-       if (worker->flags & WORKER_DIE) {
+       /*
+        * DIE can be set only while idle; REBIND set while busy means
+        * @worker->rebind_work has been scheduled.  Checking here is enough.
+        */
+       if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
                spin_unlock_irq(&gcwq->lock);
-               worker->task->flags &= ~PF_WQ_WORKER;
-               return 0;
+
+               if (worker->flags & WORKER_DIE) {
+                       worker->task->flags &= ~PF_WQ_WORKER;
+                       return 0;
+               }
+
+               idle_worker_rebind(worker);
+               goto woke_up;
        }
 
        worker_leave_idle(worker);
 recheck:
        /* no more worker necessary? */
-       if (!need_more_worker(gcwq))
+       if (!need_more_worker(pool))
                goto sleep;
 
        /* do we need to manage? */
-       if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
+       if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;
 
        /*
@@ -1979,7 +2335,7 @@ recheck:
 
        do {
                struct work_struct *work =
-                       list_first_entry(&gcwq->worklist,
+                       list_first_entry(&pool->worklist,
                                         struct work_struct, entry);
 
                if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -1991,11 +2347,11 @@ recheck:
                        move_linked_works(work, &worker->scheduled, NULL);
                        process_scheduled_works(worker);
                }
-       } while (keep_working(gcwq));
+       } while (keep_working(pool));
 
        worker_set_flags(worker, WORKER_PREP, false);
 sleep:
-       if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
+       if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
                goto recheck;
 
        /*
@@ -2053,14 +2409,15 @@ repeat:
        for_each_mayday_cpu(cpu, wq->mayday_mask) {
                unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
                struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
-               struct global_cwq *gcwq = cwq->gcwq;
+               struct worker_pool *pool = cwq->pool;
+               struct global_cwq *gcwq = pool->gcwq;
                struct work_struct *work, *n;
 
                __set_current_state(TASK_RUNNING);
                mayday_clear_cpu(cpu, wq->mayday_mask);
 
                /* migrate to the target cpu if possible */
-               rescuer->gcwq = gcwq;
+               rescuer->pool = pool;
                worker_maybe_bind_and_lock(rescuer);
 
                /*
@@ -2068,7 +2425,7 @@ repeat:
                 * process'em.
                 */
                BUG_ON(!list_empty(&rescuer->scheduled));
-               list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
+               list_for_each_entry_safe(work, n, &pool->worklist, entry)
                        if (get_work_cwq(work) == cwq)
                                move_linked_works(work, scheduled, &n);
 
@@ -2079,8 +2436,8 @@ repeat:
                 * regular worker; otherwise, we end up with 0 concurrency
                 * and stalling the execution.
                 */
-               if (keep_working(gcwq))
-                       wake_up_worker(gcwq);
+               if (keep_working(pool))
+                       wake_up_worker(pool);
 
                spin_unlock_irq(&gcwq->lock);
        }
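worker_maybe_bind_and_lock(), used by the rescuer above and by idle_worker_rebind(), is defined earlier in the file. Its rough shape: bind to the gcwq's CPU while the gcwq is associated, otherwise run unbound; either way it returns with gcwq->lock held (sketch):

static bool worker_maybe_bind_and_lock(struct worker *worker)
__acquires(&gcwq->lock)
{
        struct global_cwq *gcwq = worker->pool->gcwq;
        struct task_struct *task = worker->task;

        while (true) {
                /* racy against hotunplug; verify under gcwq->lock below */
                if (!(gcwq->flags & GCWQ_DISASSOCIATED))
                        set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));

                spin_lock_irq(&gcwq->lock);
                if (gcwq->flags & GCWQ_DISASSOCIATED)
                        return false;
                if (task_cpu(task) == gcwq->cpu &&
                    cpumask_equal(&current->cpus_allowed,
                                  get_cpu_mask(gcwq->cpu)))
                        return true;
                spin_unlock_irq(&gcwq->lock);

                /* binding raced with hotplug; back off and retry */
                cpu_relax();
                cond_resched();
        }
}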
@@ -2205,7 +2562,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
 
        for_each_cwq_cpu(cpu, wq) {
                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
-               struct global_cwq *gcwq = cwq->gcwq;
+               struct global_cwq *gcwq = cwq->pool->gcwq;
 
                spin_lock_irq(&gcwq->lock);
 
@@ -2421,17 +2778,17 @@ reflush:
                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
                bool drained;
 
-               spin_lock_irq(&cwq->gcwq->lock);
+               spin_lock_irq(&cwq->pool->gcwq->lock);
                drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
-               spin_unlock_irq(&cwq->gcwq->lock);
+               spin_unlock_irq(&cwq->pool->gcwq->lock);
 
                if (drained)
                        continue;
 
                if (++flush_cnt == 10 ||
                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
-                       pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
-                                  wq->name, flush_cnt);
+                       pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
+                               wq->name, flush_cnt);
                goto reflush;
        }
 
@@ -2442,8 +2799,7 @@ reflush:
 }
 EXPORT_SYMBOL_GPL(drain_workqueue);
 
-static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
-                            bool wait_executing)
+static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 {
        struct worker *worker = NULL;
        struct global_cwq *gcwq;
@@ -2463,15 +2819,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
                 */
                smp_rmb();
                cwq = get_work_cwq(work);
-               if (unlikely(!cwq || gcwq != cwq->gcwq))
+               if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
                        goto already_gone;
-       } else if (wait_executing) {
+       } else {
                worker = find_worker_executing_work(gcwq, work);
                if (!worker)
                        goto already_gone;
                cwq = worker->current_cwq;
-       } else
-               goto already_gone;
+       }
 
        insert_wq_barrier(cwq, barr, work, worker);
        spin_unlock_irq(&gcwq->lock);
@@ -2491,168 +2846,57 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
        return true;
 already_gone:
        spin_unlock_irq(&gcwq->lock);
-       return false;
-}
-
-/**
- * flush_work - wait for a work to finish executing the last queueing instance
- * @work: the work to flush
- *
- * Wait until @work has finished execution.  This function considers
- * only the last queueing instance of @work.  If @work has been
- * enqueued across different CPUs on a non-reentrant workqueue or on
- * multiple workqueues, @work might still be executing on return on
- * some of the CPUs from earlier queueing.
- *
- * If @work was queued only on a non-reentrant, ordered or unbound
- * workqueue, @work is guaranteed to be idle on return if it hasn't
- * been requeued since flush started.
- *
- * RETURNS:
- * %true if flush_work() waited for the work to finish execution,
- * %false if it was already idle.
- */
-bool flush_work(struct work_struct *work)
-{
-       struct wq_barrier barr;
-
-       lock_map_acquire(&work->lockdep_map);
-       lock_map_release(&work->lockdep_map);
-
-       if (start_flush_work(work, &barr, true)) {
-               wait_for_completion(&barr.done);
-               destroy_work_on_stack(&barr.work);
-               return true;
-       } else
-               return false;
-}
-EXPORT_SYMBOL_GPL(flush_work);
-
-static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
-{
-       struct wq_barrier barr;
-       struct worker *worker;
-
-       spin_lock_irq(&gcwq->lock);
-
-       worker = find_worker_executing_work(gcwq, work);
-       if (unlikely(worker))
-               insert_wq_barrier(worker->current_cwq, &barr, work, worker);
-
-       spin_unlock_irq(&gcwq->lock);
-
-       if (unlikely(worker)) {
-               wait_for_completion(&barr.done);
-               destroy_work_on_stack(&barr.work);
-               return true;
-       } else
-               return false;
-}
-
-static bool wait_on_work(struct work_struct *work)
-{
-       bool ret = false;
-       int cpu;
-
-       might_sleep();
-
-       lock_map_acquire(&work->lockdep_map);
-       lock_map_release(&work->lockdep_map);
-
-       for_each_gcwq_cpu(cpu)
-               ret |= wait_on_cpu_work(get_gcwq(cpu), work);
-       return ret;
+       return false;
 }
 
 /**
- * flush_work_sync - wait until a work has finished execution
+ * flush_work - wait for a work to finish executing the last queueing instance
  * @work: the work to flush
  *
- * Wait until @work has finished execution.  On return, it's
- * guaranteed that all queueing instances of @work which happened
- * before this function is called are finished.  In other words, if
- * @work hasn't been requeued since this function was called, @work is
- * guaranteed to be idle on return.
+ * Wait until @work has finished execution.  @work is guaranteed to be idle
+ * on return if it hasn't been requeued since flush started.
  *
  * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
+ * %true if flush_work() waited for the work to finish execution,
  * %false if it was already idle.
  */
-bool flush_work_sync(struct work_struct *work)
+bool flush_work(struct work_struct *work)
 {
        struct wq_barrier barr;
-       bool pending, waited;
 
-       /* we'll wait for executions separately, queue barr only if pending */
-       pending = start_flush_work(work, &barr, false);
-
-       /* wait for executions to finish */
-       waited = wait_on_work(work);
+       lock_map_acquire(&work->lockdep_map);
+       lock_map_release(&work->lockdep_map);
 
-       /* wait for the pending one */
-       if (pending) {
+       if (start_flush_work(work, &barr)) {
                wait_for_completion(&barr.done);
                destroy_work_on_stack(&barr.work);
+               return true;
+       } else {
+               return false;
        }
-
-       return pending || waited;
-}
-EXPORT_SYMBOL_GPL(flush_work_sync);
-
-/*
- * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
- * so this work can't be re-armed in any way.
- */
-static int try_to_grab_pending(struct work_struct *work)
-{
-       struct global_cwq *gcwq;
-       int ret = -1;
-
-       if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
-               return 0;
-
-       /*
-        * The queueing is in progress, or it is already queued. Try to
-        * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
-        */
-       gcwq = get_work_gcwq(work);
-       if (!gcwq)
-               return ret;
-
-       spin_lock_irq(&gcwq->lock);
-       if (!list_empty(&work->entry)) {
-               /*
-                * This work is queued, but perhaps we locked the wrong gcwq.
-                * In that case we must see the new value after rmb(), see
-                * insert_work()->wmb().
-                */
-               smp_rmb();
-               if (gcwq == get_work_gcwq(work)) {
-                       debug_work_deactivate(work);
-                       list_del_init(&work->entry);
-                       cwq_dec_nr_in_flight(get_work_cwq(work),
-                               get_work_color(work),
-                               *work_data_bits(work) & WORK_STRUCT_DELAYED);
-                       ret = 1;
-               }
-       }
-       spin_unlock_irq(&gcwq->lock);
-
-       return ret;
 }
+EXPORT_SYMBOL_GPL(flush_work);
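A typical caller looks like this (hypothetical driver code; update_fn and update_work are illustrative names):

static void update_fn(struct work_struct *work)
{
        /* ... deferred update body ... */
}
static DECLARE_WORK(update_work, update_fn);

static void stop_updates(void)
{
        /* wait for the last queueing instance, if any, to finish */
        if (flush_work(&update_work))
                pr_debug("update_work was in flight; waited for it\n");
}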
 
-static bool __cancel_work_timer(struct work_struct *work,
-                               struct timer_list* timer)
+static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 {
+       unsigned long flags;
        int ret;
 
        do {
-               ret = (timer && likely(del_timer(timer)));
-               if (!ret)
-                       ret = try_to_grab_pending(work);
-               wait_on_work(work);
+               ret = try_to_grab_pending(work, is_dwork, &flags);
+               /*
+                * If someone else is canceling, wait for the same event it
+                * would be waiting for before retrying.
+                */
+               if (unlikely(ret == -ENOENT))
+                       flush_work(work);
        } while (unlikely(ret < 0));
 
+       /* tell other tasks trying to grab @work to back off */
+       mark_work_canceling(work);
+       local_irq_restore(flags);
+
+       flush_work(work);
        clear_work_data(work);
        return ret;
 }
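mark_work_canceling() presumably encodes a CANCELING marker next to the CPU in the off-queue work data, so that concurrent try_to_grab_pending() callers return -ENOENT and back off through flush_work() as above (a sketch; WORK_CPU_NONE and WORK_OFFQ_CANCELING are assumed constants):

static void mark_work_canceling(struct work_struct *work)
{
        struct global_cwq *gcwq = get_work_gcwq(work);
        unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;

        set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
                      WORK_STRUCT_PENDING);
}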
@@ -2677,7 +2921,7 @@ static bool __cancel_work_timer(struct work_struct *work,
  */
 bool cancel_work_sync(struct work_struct *work)
 {
-       return __cancel_work_timer(work, NULL);
+       return __cancel_work_timer(work, false);
 }
 EXPORT_SYMBOL_GPL(cancel_work_sync);
 
@@ -2695,34 +2939,15 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
  */
 bool flush_delayed_work(struct delayed_work *dwork)
 {
+       local_irq_disable();
        if (del_timer_sync(&dwork->timer))
-               __queue_work(raw_smp_processor_id(),
+               __queue_work(dwork->cpu,
                             get_work_cwq(&dwork->work)->wq, &dwork->work);
+       local_irq_enable();
        return flush_work(&dwork->work);
 }
 EXPORT_SYMBOL(flush_delayed_work);
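Usage mirrors flush_work(); for example (hypothetical names):

static void poll_fn(struct work_struct *work)
{
        /* ... periodic poll body ... */
}
static DECLARE_DELAYED_WORK(poll_dwork, poll_fn);

static void drain_poll(void)
{
        /* fire a pending poll immediately and wait for it to finish */
        flush_delayed_work(&poll_dwork);
}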
 
-/**
- * flush_delayed_work_sync - wait for a dwork to finish
- * @dwork: the delayed work to flush
- *
- * Delayed timer is cancelled and the pending work is queued for
- * execution immediately.  Other than timer handling, its behavior
- * is identical to flush_work_sync().
- *
- * RETURNS:
- * %true if flush_work_sync() waited for the work to finish execution,
- * %false if it was already idle.
- */
-bool flush_delayed_work_sync(struct delayed_work *dwork)
-{
-       if (del_timer_sync(&dwork->timer))
-               __queue_work(raw_smp_processor_id(),
-                            get_work_cwq(&dwork->work)->wq, &dwork->work);
-       return flush_work_sync(&dwork->work);
-}
-EXPORT_SYMBOL(flush_delayed_work_sync);
-
 /**
  * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
  * @dwork: the delayed work cancel
@@ -2734,54 +2959,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync);
  */
 bool cancel_delayed_work_sync(struct delayed_work *dwork)
 {
-       return __cancel_work_timer(&dwork->work, &dwork->timer);
+       return __cancel_work_timer(&dwork->work, true);
 }
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
 /**
- * schedule_work - put work task in global workqueue
- * @work: job to be done
- *
- * Returns zero if @work was already on the kernel-global workqueue and
- * non-zero otherwise.
- *
- * This puts a job in the kernel-global workqueue if it was not already
- * queued and leaves it in the same position on the kernel-global
- * workqueue otherwise.
- */
-int schedule_work(struct work_struct *work)
-{
-       return queue_work(system_wq, work);
-}
-EXPORT_SYMBOL(schedule_work);
-
-/*
  * schedule_work_on - put work task on a specific cpu
  * @cpu: cpu to put the work task on
  * @work: job to be done
  *
  * This puts a job on a specific cpu
  */
-int schedule_work_on(int cpu, struct work_struct *work)
+bool schedule_work_on(int cpu, struct work_struct *work)
 {
        return queue_work_on(cpu, system_wq, work);
 }
 EXPORT_SYMBOL(schedule_work_on);
 
 /**
- * schedule_delayed_work - put work task in global workqueue after delay
- * @dwork: job to be done
- * @delay: number of jiffies to wait or 0 for immediate execution
+ * schedule_work - put work task in global workqueue
+ * @work: job to be done
  *
- * After waiting for a given time this puts a job in the kernel-global
- * workqueue.
+ * Returns %false if @work was already on the kernel-global workqueue and
+ * %true otherwise.
+ *
+ * This puts a job in the kernel-global workqueue if it was not already
+ * queued and leaves it in the same position on the kernel-global
+ * workqueue otherwise.
  */
-int schedule_delayed_work(struct delayed_work *dwork,
-                                       unsigned long delay)
+bool schedule_work(struct work_struct *work)
 {
-       return queue_delayed_work(system_wq, dwork, delay);
+       return queue_work(system_wq, work);
 }
-EXPORT_SYMBOL(schedule_delayed_work);
+EXPORT_SYMBOL(schedule_work);
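With the bool conversion, callers can branch on the return value directly; for example (hypothetical struct and field names):

struct my_dev {
        struct work_struct reset_work;
};

static void kick_reset(struct my_dev *dev)
{
        /* false: a reset was already pending and will cover this request */
        if (!schedule_work(&dev->reset_work))
                pr_debug("reset_work already pending, coalesced\n");
}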
 
 /**
  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
@@ -2792,13 +3002,27 @@ EXPORT_SYMBOL(schedule_delayed_work);
  * After waiting for a given time this puts a job in the kernel-global
  * workqueue on the specified CPU.
  */
-int schedule_delayed_work_on(int cpu,
-                       struct delayed_work *dwork, unsigned long delay)
+bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
+                             unsigned long delay)
 {
        return queue_delayed_work_on(cpu, system_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work_on);
 
+/**
+ * schedule_delayed_work - put work task in global workqueue after delay
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait or 0 for immediate execution
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue.
+ */
+bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
+{
+       return queue_delayed_work(system_wq, dwork, delay);
+}
+EXPORT_SYMBOL(schedule_delayed_work);
+
 /**
  * schedule_on_each_cpu - execute a function synchronously on each online CPU
  * @func: the function to call
@@ -2946,9 +3170,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
        int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
 
        if (max_active < 1 || max_active > lim)
-               printk(KERN_WARNING "workqueue: max_active %d requested for %s "
-                      "is out of range, clamping between %d and %d\n",
-                      max_active, name, 1, lim);
+               pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
+                       max_active, name, 1, lim);
 
        return clamp_val(max_active, 1, lim);
 }
@@ -2984,13 +3207,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        if (flags & WQ_MEM_RECLAIM)
                flags |= WQ_RESCUER;
 
-       /*
-        * Unbound workqueues aren't concurrency managed and should be
-        * dispatched to workers immediately.
-        */
-       if (flags & WQ_UNBOUND)
-               flags |= WQ_HIGHPRI;
-
        max_active = max_active ?: WQ_DFL_ACTIVE;
        max_active = wq_clamp_max_active(max_active, flags, wq->name);
 
@@ -3011,9 +3227,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        for_each_cwq_cpu(cpu, wq) {
                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
                struct global_cwq *gcwq = get_gcwq(cpu);
+               int pool_idx = (bool)(flags & WQ_HIGHPRI);
 
                BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
-               cwq->gcwq = gcwq;
+               cwq->pool = &gcwq->pools[pool_idx];
                cwq->wq = wq;
                cwq->flush_color = -1;
                cwq->max_active = max_active;
@@ -3225,369 +3442,143 @@ EXPORT_SYMBOL_GPL(work_busy);
  * gcwqs serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
- * This is solved by allowing a gcwq to be detached from CPU, running
- * it with unbound (rogue) workers and allowing it to be reattached
- * later if the cpu comes back online.  A separate thread is created
- * to govern a gcwq in such state and is called the trustee of the
- * gcwq.
- *
- * Trustee states and their descriptions.
- *
- * START       Command state used on startup.  On CPU_DOWN_PREPARE, a
- *             new trustee is started with this state.
- *
- * IN_CHARGE   Once started, trustee will enter this state after
- *             assuming the manager role and making all existing
- *             workers rogue.  DOWN_PREPARE waits for trustee to
- *             enter this state.  After reaching IN_CHARGE, trustee
- *             tries to execute the pending worklist until it's empty
- *             and the state is set to BUTCHER, or the state is set
- *             to RELEASE.
- *
- * BUTCHER     Command state which is set by the cpu callback after
- *             the cpu has went down.  Once this state is set trustee
- *             knows that there will be no new works on the worklist
- *             and once the worklist is empty it can proceed to
- *             killing idle workers.
- *
- * RELEASE     Command state which is set by the cpu callback if the
- *             cpu down has been canceled or it has come online
- *             again.  After recognizing this state, trustee stops
- *             trying to drain or butcher and clears ROGUE, rebinds
- *             all remaining workers back to the cpu and releases
- *             manager role.
- *
- * DONE                Trustee will enter this state after BUTCHER or RELEASE
- *             is complete.
- *
- *          trustee                 CPU                draining
- *         took over                down               complete
- * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
- *                        |                     |                  ^
- *                        | CPU is back online  v   return workers |
- *                         ----------------> RELEASE --------------
- */
-
-/**
- * trustee_wait_event_timeout - timed event wait for trustee
- * @cond: condition to wait for
- * @timeout: timeout in jiffies
- *
- * wait_event_timeout() for trustee to use.  Handles locking and
- * checks for RELEASE request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * Positive indicating left time if @cond is satisfied, 0 if timed
- * out, -1 if canceled.
- */
-#define trustee_wait_event_timeout(cond, timeout) ({                   \
-       long __ret = (timeout);                                         \
-       while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
-              __ret) {                                                 \
-               spin_unlock_irq(&gcwq->lock);                           \
-               __wait_event_timeout(gcwq->trustee_wait, (cond) ||      \
-                       (gcwq->trustee_state == TRUSTEE_RELEASE),       \
-                       __ret);                                         \
-               spin_lock_irq(&gcwq->lock);                             \
-       }                                                               \
-       gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);          \
-})
-
-/**
- * trustee_wait_event - event wait for trustee
- * @cond: condition to wait for
- *
- * wait_event() for trustee to use.  Automatically handles locking and
- * checks for CANCEL request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * 0 if @cond is satisfied, -1 if canceled.
+ * This is solved by allowing a gcwq to be disassociated from the CPU,
+ * run as an unbound one, and allowing it to be reattached later if the
+ * cpu comes back online.
  */
-#define trustee_wait_event(cond) ({                                    \
-       long __ret1;                                                    \
-       __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
-       __ret1 < 0 ? -1 : 0;                                            \
-})
 
-static int __cpuinit trustee_thread(void *__gcwq)
+/* claim manager positions of all pools */
+static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
 {
-       struct global_cwq *gcwq = __gcwq;
-       struct worker *worker;
-       struct work_struct *work;
-       struct hlist_node *pos;
-       long rc;
-       int i;
-
-       BUG_ON(gcwq->cpu != smp_processor_id());
+       struct worker_pool *pool;
 
+       for_each_worker_pool(pool, gcwq)
+               mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
        spin_lock_irq(&gcwq->lock);
-       /*
-        * Claim the manager position and make all workers rogue.
-        * Trustee must be bound to the target cpu and can't be
-        * cancelled.
-        */
-       BUG_ON(gcwq->cpu != smp_processor_id());
-       rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
-       BUG_ON(rc < 0);
-
-       gcwq->flags |= GCWQ_MANAGING_WORKERS;
-
-       list_for_each_entry(worker, &gcwq->idle_list, entry)
-               worker->flags |= WORKER_ROGUE;
+}
 
-       for_each_busy_worker(worker, i, pos, gcwq)
-               worker->flags |= WORKER_ROGUE;
+/* release manager positions */
+static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
+{
+       struct worker_pool *pool;
 
-       /*
-        * Call schedule() so that we cross rq->lock and thus can
-        * guarantee sched callbacks see the rogue flag.  This is
-        * necessary as scheduler callbacks may be invoked from other
-        * cpus.
-        */
        spin_unlock_irq(&gcwq->lock);
-       schedule();
-       spin_lock_irq(&gcwq->lock);
+       for_each_worker_pool(pool, gcwq)
+               mutex_unlock(&pool->manager_mutex);
+}
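for_each_worker_pool(), used throughout these hunks, presumably walks the fixed pool array on the gcwq (sketch, assuming a gcwq->pools[NR_WORKER_POOLS] layout):

#define for_each_worker_pool(pool, gcwq)                                \
        for ((pool) = &(gcwq)->pools[0];                                \
             (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)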
 
-       /*
-        * Sched callbacks are disabled now.  Zap nr_running.  After
-        * this, nr_running stays zero and need_more_worker() and
-        * keep_working() are always true as long as the worklist is
-        * not empty.
-        */
-       atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
+static void gcwq_unbind_fn(struct work_struct *work)
+{
+       struct global_cwq *gcwq = get_gcwq(smp_processor_id());
+       struct worker_pool *pool;
+       struct worker *worker;
+       struct hlist_node *pos;
+       int i;
 
-       spin_unlock_irq(&gcwq->lock);
-       del_timer_sync(&gcwq->idle_timer);
-       spin_lock_irq(&gcwq->lock);
+       BUG_ON(gcwq->cpu != smp_processor_id());
 
-       /*
-        * We're now in charge.  Notify and proceed to drain.  We need
-        * to keep the gcwq running during the whole CPU down
-        * procedure as other cpu hotunplug callbacks may need to
-        * flush currently running tasks.
-        */
-       gcwq->trustee_state = TRUSTEE_IN_CHARGE;
-       wake_up_all(&gcwq->trustee_wait);
+       gcwq_claim_management_and_lock(gcwq);
 
        /*
-        * The original cpu is in the process of dying and may go away
-        * anytime now.  When that happens, we and all workers would
-        * be migrated to other cpus.  Try draining any left work.  We
-        * want to get it over with ASAP - spam rescuers, wake up as
-        * many idlers as necessary and create new ones till the
-        * worklist is empty.  Note that if the gcwq is frozen, there
-        * may be frozen works in freezable cwqs.  Don't declare
-        * completion while frozen.
+        * We've claimed all manager positions.  Make all workers unbound
+        * and set DISASSOCIATED.  Before this, all workers except for the
+        * ones which are still executing works from before the last CPU
+        * down must be on the cpu.  After this, they may become diasporas.
         */
-       while (gcwq->nr_workers != gcwq->nr_idle ||
-              gcwq->flags & GCWQ_FREEZING ||
-              gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
-               int nr_works = 0;
-
-               list_for_each_entry(work, &gcwq->worklist, entry) {
-                       send_mayday(work);
-                       nr_works++;
-               }
+       for_each_worker_pool(pool, gcwq)
+               list_for_each_entry(worker, &pool->idle_list, entry)
+                       worker->flags |= WORKER_UNBOUND;
 
-               list_for_each_entry(worker, &gcwq->idle_list, entry) {
-                       if (!nr_works--)
-                               break;
-                       wake_up_process(worker->task);
-               }
+       for_each_busy_worker(worker, i, pos, gcwq)
+               worker->flags |= WORKER_UNBOUND;
 
-               if (need_to_create_worker(gcwq)) {
-                       spin_unlock_irq(&gcwq->lock);
-                       worker = create_worker(gcwq, false);
-                       spin_lock_irq(&gcwq->lock);
-                       if (worker) {
-                               worker->flags |= WORKER_ROGUE;
-                               start_worker(worker);
-                       }
-               }
+       gcwq->flags |= GCWQ_DISASSOCIATED;
 
-               /* give a breather */
-               if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
-                       break;
-       }
+       gcwq_release_management_and_unlock(gcwq);
 
        /*
-        * Either all works have been scheduled and cpu is down, or
-        * cpu down has already been canceled.  Wait for and butcher
-        * all workers till we're canceled.
+        * Call schedule() so that we cross rq->lock and thus can guarantee
+        * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
+        * as scheduler callbacks may be invoked from other cpus.
         */
-       do {
-               rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
-               while (!list_empty(&gcwq->idle_list))
-                       destroy_worker(list_first_entry(&gcwq->idle_list,
-                                                       struct worker, entry));
-       } while (gcwq->nr_workers && rc >= 0);
+       schedule();
 
        /*
-        * At this point, either draining has completed and no worker
-        * is left, or cpu down has been canceled or the cpu is being
-        * brought back up.  There shouldn't be any idle one left.
-        * Tell the remaining busy ones to rebind once it finishes the
-        * currently scheduled works by scheduling the rebind_work.
+        * Sched callbacks are disabled now.  Zap nr_running.  After this,
+        * nr_running stays zero and need_more_worker() and keep_working()
+        * are always true as long as the worklist is not empty.  @gcwq now
+        * behaves as unbound (in terms of concurrency management) gcwq
+        * which is served by workers tied to the CPU.
+        *
+        * On return from this function, the current worker would trigger
+        * unbound chain execution of pending work items if other workers
+        * didn't already.
         */
-       WARN_ON(!list_empty(&gcwq->idle_list));
-
-       for_each_busy_worker(worker, i, pos, gcwq) {
-               struct work_struct *rebind_work = &worker->rebind_work;
-
-               /*
-                * Rebind_work may race with future cpu hotplug
-                * operations.  Use a separate flag to mark that
-                * rebinding is scheduled.
-                */
-               worker->flags |= WORKER_REBIND;
-               worker->flags &= ~WORKER_ROGUE;
-
-               /* queue rebind_work, wq doesn't matter, use the default one */
-               if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
-                                    work_data_bits(rebind_work)))
-                       continue;
-
-               debug_work_activate(rebind_work);
-               insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
-                           worker->scheduled.next,
-                           work_color_to_flags(WORK_NO_COLOR));
-       }
-
-       /* relinquish manager role */
-       gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
-
-       /* notify completion */
-       gcwq->trustee = NULL;
-       gcwq->trustee_state = TRUSTEE_DONE;
-       wake_up_all(&gcwq->trustee_wait);
-       spin_unlock_irq(&gcwq->lock);
-       return 0;
+       for_each_worker_pool(pool, gcwq)
+               atomic_set(get_pool_nr_running(pool), 0);
 }
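get_pool_nr_running() plausibly resolves to per-cpu counter arrays indexed by pool priority, with a separate static set for the unbound gcwq (a sketch; pool_nr_running and unbound_pool_nr_running are assumed names):

static atomic_t *get_pool_nr_running(struct worker_pool *pool)
{
        int cpu = pool->gcwq->cpu;
        int idx = worker_pool_pri(pool);

        if (cpu != WORK_CPU_UNBOUND)
                return &per_cpu(pool_nr_running, cpu)[idx];
        else
                return &unbound_pool_nr_running[idx];
}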
 
-/**
- * wait_trustee_state - wait for trustee to enter the specified state
- * @gcwq: gcwq the trustee of interest belongs to
- * @state: target state to wait for
- *
- * Wait for the trustee to reach @state.  DONE is already matched.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by cpu_callback.
+/*
+ * Workqueues should be brought up before normal priority CPU notifiers.
+ * This will be registered as a high priority CPU notifier.
  */
-static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
-__releases(&gcwq->lock)
-__acquires(&gcwq->lock)
-{
-       if (!(gcwq->trustee_state == state ||
-             gcwq->trustee_state == TRUSTEE_DONE)) {
-               spin_unlock_irq(&gcwq->lock);
-               __wait_event(gcwq->trustee_wait,
-                            gcwq->trustee_state == state ||
-                            gcwq->trustee_state == TRUSTEE_DONE);
-               spin_lock_irq(&gcwq->lock);
-       }
-}
-
-static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
-                                               unsigned long action,
-                                               void *hcpu)
+static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+                                              unsigned long action,
+                                              void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
        struct global_cwq *gcwq = get_gcwq(cpu);
-       struct task_struct *new_trustee = NULL;
-       struct worker *uninitialized_var(new_worker);
-       unsigned long flags;
-
-       action &= ~CPU_TASKS_FROZEN;
+       struct worker_pool *pool;
 
-       switch (action) {
-       case CPU_DOWN_PREPARE:
-               new_trustee = kthread_create(trustee_thread, gcwq,
-                                            "workqueue_trustee/%d\n", cpu);
-               if (IS_ERR(new_trustee))
-                       return notifier_from_errno(PTR_ERR(new_trustee));
-               kthread_bind(new_trustee, cpu);
-               /* fall through */
+       switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
-               BUG_ON(gcwq->first_idle);
-               new_worker = create_worker(gcwq, false);
-               if (!new_worker) {
-                       if (new_trustee)
-                               kthread_stop(new_trustee);
-                       return NOTIFY_BAD;
-               }
-       }
-
-       /* some are called w/ irq disabled, don't disturb irq status */
-       spin_lock_irqsave(&gcwq->lock, flags);
+               for_each_worker_pool(pool, gcwq) {
+                       struct worker *worker;
 
-       switch (action) {
-       case CPU_DOWN_PREPARE:
-               /* initialize trustee and tell it to acquire the gcwq */
-               BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
-               gcwq->trustee = new_trustee;
-               gcwq->trustee_state = TRUSTEE_START;
-               wake_up_process(gcwq->trustee);
-               wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
-               /* fall through */
-       case CPU_UP_PREPARE:
-               BUG_ON(gcwq->first_idle);
-               gcwq->first_idle = new_worker;
-               break;
+                       if (pool->nr_workers)
+                               continue;
 
-       case CPU_DYING:
-               /*
-                * Before this, the trustee and all workers except for
-                * the ones which are still executing works from
-                * before the last CPU down must be on the cpu.  After
-                * this, they'll all be diasporas.
-                */
-               gcwq->flags |= GCWQ_DISASSOCIATED;
-               break;
+                       worker = create_worker(pool);
+                       if (!worker)
+                               return NOTIFY_BAD;
 
-       case CPU_POST_DEAD:
-               gcwq->trustee_state = TRUSTEE_BUTCHER;
-               /* fall through */
-       case CPU_UP_CANCELED:
-               destroy_worker(gcwq->first_idle);
-               gcwq->first_idle = NULL;
+                       spin_lock_irq(&gcwq->lock);
+                       start_worker(worker);
+                       spin_unlock_irq(&gcwq->lock);
+               }
                break;
 
        case CPU_DOWN_FAILED:
        case CPU_ONLINE:
+               gcwq_claim_management_and_lock(gcwq);
                gcwq->flags &= ~GCWQ_DISASSOCIATED;
-               if (gcwq->trustee_state != TRUSTEE_DONE) {
-                       gcwq->trustee_state = TRUSTEE_RELEASE;
-                       wake_up_process(gcwq->trustee);
-                       wait_trustee_state(gcwq, TRUSTEE_DONE);
-               }
-
-               /*
-                * Trustee is done and there might be no worker left.
-                * Put the first_idle in and request a real manager to
-                * take a look.
-                */
-               spin_unlock_irq(&gcwq->lock);
-               kthread_bind(gcwq->first_idle->task, cpu);
-               spin_lock_irq(&gcwq->lock);
-               gcwq->flags |= GCWQ_MANAGE_WORKERS;
-               start_worker(gcwq->first_idle);
-               gcwq->first_idle = NULL;
+               rebind_workers(gcwq);
+               gcwq_release_management_and_unlock(gcwq);
                break;
        }
+       return NOTIFY_OK;
+}
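
The hotplug path above iterates with for_each_worker_pool(), which this series introduces together with NR_WORKER_POOLS. A minimal sketch of how such an iterator can be defined, assuming struct global_cwq embeds a pools[NR_WORKER_POOLS] array as the earlier hunks of the series do:

/* sketch: walk every worker_pool embedded in a gcwq */
#define for_each_worker_pool(pool, gcwq)				\
	for ((pool) = &(gcwq)->pools[0];				\
	     (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)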
 
-       spin_unlock_irqrestore(&gcwq->lock, flags);
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as a low priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+                                                unsigned long action,
+                                                void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+       struct work_struct unbind_work;
 
-       return notifier_from_errno(0);
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               /* unbinding should happen on the local CPU */
+               INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+               queue_work_on(cpu, system_highpri_wq, &unbind_work);
+               flush_work(&unbind_work);
+               break;
+       }
+       return NOTIFY_OK;
 }
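
CPU_DOWN_PREPARE uses the standard pattern for forcing a function to run on one particular CPU: an on-stack work item is queued on that CPU and flushed before the stack frame unwinds, so gcwq_unbind_fn() has finished unbinding by the time the notifier returns. A generic sketch of the pattern; do_on_cpu() is a hypothetical helper for illustration, not part of this patch:

/* hypothetical helper illustrating the queue-then-flush pattern above */
static void do_on_cpu(unsigned int cpu, work_func_t fn)
{
	struct work_struct work;

	INIT_WORK_ONSTACK(&work, fn);	/* no allocation; lives on the stack */
	queue_work_on(cpu, system_highpri_wq, &work);	/* runs bound to @cpu */
	flush_work(&work);	/* wait before the stack frame goes away */
}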
 
 #ifdef CONFIG_SMP
@@ -3746,6 +3737,7 @@ void thaw_workqueues(void)
 
        for_each_gcwq_cpu(cpu) {
                struct global_cwq *gcwq = get_gcwq(cpu);
+               struct worker_pool *pool;
                struct workqueue_struct *wq;
 
                spin_lock_irq(&gcwq->lock);
@@ -3767,7 +3759,8 @@ void thaw_workqueues(void)
                                cwq_activate_first_delayed(cwq);
                }
 
-               wake_up_worker(gcwq);
+               for_each_worker_pool(pool, gcwq)
+                       wake_up_worker(pool);
 
                spin_unlock_irq(&gcwq->lock);
        }
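
thaw_workqueues() now wakes each pool separately because the idle list moved from the gcwq into struct worker_pool. The helpers live earlier in the file; a sketch of roughly what the per-pool wakeup amounts to:

/* sketch: wake the first idle worker of a pool, if any */
static struct worker *first_worker(struct worker_pool *pool)
{
	if (list_empty(&pool->idle_list))
		return NULL;
	return list_first_entry(&pool->idle_list, struct worker, entry);
}

static void wake_up_worker(struct worker_pool *pool)
{
	struct worker *worker = first_worker(pool);

	if (worker)
		wake_up_process(worker->task);
}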
@@ -3783,60 +3776,72 @@ static int __init init_workqueues(void)
        unsigned int cpu;
        int i;
 
-       cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+       /* make sure we have enough bits for OFFQ CPU number */
+       BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
+                    WORK_CPU_LAST);
+
+       cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
+       cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
 
        /* initialize gcwqs */
        for_each_gcwq_cpu(cpu) {
                struct global_cwq *gcwq = get_gcwq(cpu);
+               struct worker_pool *pool;
 
                spin_lock_init(&gcwq->lock);
-               INIT_LIST_HEAD(&gcwq->worklist);
                gcwq->cpu = cpu;
                gcwq->flags |= GCWQ_DISASSOCIATED;
 
-               INIT_LIST_HEAD(&gcwq->idle_list);
                for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
                        INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
 
-               init_timer_deferrable(&gcwq->idle_timer);
-               gcwq->idle_timer.function = idle_worker_timeout;
-               gcwq->idle_timer.data = (unsigned long)gcwq;
+               for_each_worker_pool(pool, gcwq) {
+                       pool->gcwq = gcwq;
+                       INIT_LIST_HEAD(&pool->worklist);
+                       INIT_LIST_HEAD(&pool->idle_list);
 
-               setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
-                           (unsigned long)gcwq);
+                       init_timer_deferrable(&pool->idle_timer);
+                       pool->idle_timer.function = idle_worker_timeout;
+                       pool->idle_timer.data = (unsigned long)pool;
 
-               ida_init(&gcwq->worker_ida);
+                       setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
+                                   (unsigned long)pool);
 
-               gcwq->trustee_state = TRUSTEE_DONE;
-               init_waitqueue_head(&gcwq->trustee_wait);
+                       mutex_init(&pool->manager_mutex);
+                       ida_init(&pool->worker_ida);
+               }
+
+               init_waitqueue_head(&gcwq->rebind_hold);
        }
 
        /* create the initial worker */
        for_each_online_gcwq_cpu(cpu) {
                struct global_cwq *gcwq = get_gcwq(cpu);
-               struct worker *worker;
+               struct worker_pool *pool;
 
                if (cpu != WORK_CPU_UNBOUND)
                        gcwq->flags &= ~GCWQ_DISASSOCIATED;
-               worker = create_worker(gcwq, true);
-               BUG_ON(!worker);
-               spin_lock_irq(&gcwq->lock);
-               start_worker(worker);
-               spin_unlock_irq(&gcwq->lock);
+
+               for_each_worker_pool(pool, gcwq) {
+                       struct worker *worker;
+
+                       worker = create_worker(pool);
+                       BUG_ON(!worker);
+                       spin_lock_irq(&gcwq->lock);
+                       start_worker(worker);
+                       spin_unlock_irq(&gcwq->lock);
+               }
        }
 
        system_wq = alloc_workqueue("events", 0, 0);
+       system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
        system_long_wq = alloc_workqueue("events_long", 0, 0);
-       system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                            WQ_UNBOUND_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
                                              WQ_FREEZABLE, 0);
-       system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
-                       WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
-       BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
-              !system_unbound_wq || !system_freezable_wq ||
-               !system_nrt_freezable_wq);
+       BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+              !system_unbound_wq || !system_freezable_wq);
        return 0;
 }
 early_initcall(init_workqueues);
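
Two details of init_workqueues() are worth spelling out. The new BUILD_BUG_ON verifies that the bits of an unsigned long above WORK_OFFQ_CPU_SHIFT are enough to represent WORK_CPU_LAST, i.e. that the CPU number stashed in an off-queue work->data can never be truncated. And because the idle and mayday timers are now per pool, they pass the pool itself as timer data; the handlers presumably recover it with the usual cast, roughly:

/* sketch: how a per-pool timer handler gets its pool back */
static void idle_worker_timeout(unsigned long __pool)
{
	struct worker_pool *pool = (void *)__pool;

	/* ... examine pool->idle_list and re-arm pool->idle_timer ... */
}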