Merge remote-tracking branch 'y2038/y2038'

[karo-tx-linux.git] / kernel / cpuset.c
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index fdf1759bdca5725c952187d28d2e083b42a59cea..c02d677c541c68067f76f0053864ed176ba39ccc 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -474,7 +474,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
  
         /* On legacy hiearchy, we must be a subset of our parent cpuset. */
         ret = -EACCES;
-       if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
+       if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+           !is_cpuset_subset(trial, par))
                 goto out;
  
         /*
@@ -498,7 +499,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
          * be changed to have empty cpus_allowed or mems_allowed.
          */
         ret = -ENOSPC;
-       if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+       if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
                 if (!cpumask_empty(cur->cpus_allowed) &&
                     cpumask_empty(trial->cpus_allowed))
                         goto out;
@@ -549,9 +550,6 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
  
         rcu_read_lock();
         cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
-               if (cp == root_cs)
-                       continue;
-
                 /* skip the whole subtree if @cp doesn't have any CPU */
                 if (cpumask_empty(cp->cpus_allowed)) {
                         pos_css = css_rightmost_descendant(pos_css);
@@ -626,6 +624,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
         int csn;                /* how many cpuset ptrs in csa so far */
         int i, j, k;            /* indices for partition finding loops */
         cpumask_var_t *doms;    /* resulting partition; i.e. sched domains */
+       cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
         struct sched_domain_attr *dattr;  /* attributes for custom domains */
         int ndoms = 0;          /* number of sched domains in result */
         int nslot;              /* next empty doms[] struct cpumask slot */
@@ -635,6 +634,10 @@ static int generate_sched_domains(cpumask_var_t **domains,
         dattr = NULL;
         csa = NULL;
  
+       if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
+               goto done;
+       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+
         /* Special case for the 99% of systems with one, full, sched domain */
         if (is_sched_load_balance(&top_cpuset)) {
                 ndoms = 1;
@@ -647,7 +650,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
                         *dattr = SD_ATTR_INIT;
                         update_domain_attr_tree(dattr, &top_cpuset);
                 }
-               cpumask_copy(doms[0], top_cpuset.effective_cpus);
+               cpumask_and(doms[0], top_cpuset.effective_cpus,
+                                    non_isolated_cpus);
  
                 goto done;
         }
@@ -670,7 +674,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
                  * the corresponding sched domain.
                  */
                 if (!cpumask_empty(cp->cpus_allowed) &&
-                   !is_sched_load_balance(cp))
+                   !(is_sched_load_balance(cp) &&
+                     cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
                         continue;
  
                 if (is_sched_load_balance(cp))
@@ -752,6 +757,7 @@ restart:
  
                         if (apn == b->pn) {
                                 cpumask_or(dp, dp, b->effective_cpus);
+                               cpumask_and(dp, dp, non_isolated_cpus);
                                 if (dattr)
                                         update_domain_attr_tree(dattr + nslot, b);
  
@@ -764,6 +770,7 @@ restart:
         BUG_ON(nslot != ndoms);
  
  done:
+       free_cpumask_var(non_isolated_cpus);
         kfree(csa);
  
         /*
@@ -874,7 +881,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
                  * If it becomes empty, inherit the effective mask of the
                  * parent, which is guaranteed to have some CPUs.
                  */
-               if (cpumask_empty(new_cpus))
+               if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+                   cpumask_empty(new_cpus))
                         cpumask_copy(new_cpus, parent->effective_cpus);
  
                 /* Skip the whole subtree if the cpumask remains the same. */
@@ -891,7 +899,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
                 cpumask_copy(cp->effective_cpus, new_cpus);
                 spin_unlock_irq(&callback_lock);
  
-               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+               WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
  
                 update_tasks_cpumask(cp);
@@ -1130,7 +1138,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
                  * If it becomes empty, inherit the effective mask of the
                  * parent, which is guaranteed to have some MEMs.
                  */
-               if (nodes_empty(*new_mems))
+               if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+                   nodes_empty(*new_mems))
                         *new_mems = parent->effective_mems;
  
                 /* Skip the whole subtree if the nodemask remains the same. */
@@ -1147,7 +1156,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
                 cp->effective_mems = *new_mems;
                 spin_unlock_irq(&callback_lock);
  
-               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+               WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
  
                 update_tasks_nodemask(cp);
@@ -1218,7 +1227,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
         spin_unlock_irq(&callback_lock);
  
         /* use trialcs->mems_allowed as a temp variable */
-       update_nodemasks_hier(cs, &cs->mems_allowed);
+       update_nodemasks_hier(cs, &trialcs->mems_allowed);
  done:
         return retval;
  }
@@ -1438,7 +1447,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
  
         /* allow moving tasks into an empty cpuset if on default hierarchy */
         ret = -ENOSPC;
-       if (!cgroup_on_dfl(css->cgroup) &&
+       if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
             (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                 goto out_unlock;
  
@@ -1482,9 +1491,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
  {
         /* static buf protected by cpuset_mutex */
         static nodemask_t cpuset_attach_nodemask_to;
-       struct mm_struct *mm;
         struct task_struct *task;
-       struct task_struct *leader = cgroup_taskset_first(tset);
+       struct task_struct *leader;
         struct cpuset *cs = css_cs(css);
         struct cpuset *oldcs = cpuset_attach_old_cs;
  
@@ -1510,26 +1518,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
         }
  
         /*
-        * Change mm, possibly for multiple threads in a threadgroup. This is
-        * expensive and may sleep.
+        * Change mm for all threadgroup leaders. This is expensive and may
+        * sleep, and should be moved outside the migration path proper.
          */
         cpuset_attach_nodemask_to = cs->effective_mems;
-       mm = get_task_mm(leader);
-       if (mm) {
-               mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
-
-               /*
-                * old_mems_allowed is the same with mems_allowed here, except
-                * if this task is being moved automatically due to hotplug.
-                * In that case @mems_allowed has been updated and is empty,
-                * so @old_mems_allowed is the right nodesets that we migrate
-                * mm from.
-                */
-               if (is_memory_migrate(cs)) {
-                       cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
-                                         &cpuset_attach_nodemask_to);
+       cgroup_taskset_for_each_leader(leader, tset) {
+               struct mm_struct *mm = get_task_mm(leader);
+
+               if (mm) {
+                       mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+
+                       /*
+                        * old_mems_allowed is the same as mems_allowed
+                        * here, except if this task is being moved
+                        * automatically due to hotplug.  In that case
+                        * @mems_allowed has been updated and is empty, so
+                        * @old_mems_allowed is the right nodemask to
+                        * migrate the mm from.
+                        */
+                       if (is_memory_migrate(cs)) {
+                               cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+                                                 &cpuset_attach_nodemask_to);
+                       }
+                       mmput(mm);
                 }
-               mmput(mm);
         }
  
         cs->old_mems_allowed = cpuset_attach_nodemask_to;
@@ -1592,9 +1604,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
         case FILE_MEMORY_PRESSURE_ENABLED:
                 cpuset_memory_pressure_enabled = !!val;
                 break;
-       case FILE_MEMORY_PRESSURE:
-               retval = -EACCES;
-               break;
         case FILE_SPREAD_PAGE:
                 retval = update_flag(CS_SPREAD_PAGE, cs, val);
                 break;
@@ -1711,40 +1720,27 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
  {
         struct cpuset *cs = css_cs(seq_css(sf));
         cpuset_filetype_t type = seq_cft(sf)->private;
-       ssize_t count;
-       char *buf, *s;
         int ret = 0;
  
-       count = seq_get_buf(sf, &buf);
-       s = buf;
-
         spin_lock_irq(&callback_lock);
  
         switch (type) {
         case FILE_CPULIST:
-               s += cpulist_scnprintf(s, count, cs->cpus_allowed);
+               seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
                 break;
         case FILE_MEMLIST:
-               s += nodelist_scnprintf(s, count, cs->mems_allowed);
+               seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
                 break;
         case FILE_EFFECTIVE_CPULIST:
-               s += cpulist_scnprintf(s, count, cs->effective_cpus);
+               seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
                 break;
         case FILE_EFFECTIVE_MEMLIST:
-               s += nodelist_scnprintf(s, count, cs->effective_mems);
+               seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
                 break;
         default:
                 ret = -EINVAL;
-               goto out_unlock;
         }
  
-       if (s < buf + count - 1) {
-               *s++ = '\n';
-               seq_commit(sf, s - buf);
-       } else {
-               seq_commit(sf, -1);
-       }
-out_unlock:
         spin_unlock_irq(&callback_lock);
         return ret;
  }
@@ -1874,9 +1870,6 @@ static struct cftype files[] = {
         {
                 .name = "memory_pressure",
                 .read_u64 = cpuset_read_u64,
-               .write_u64 = cpuset_write_u64,
-               .private = FILE_MEMORY_PRESSURE,
-               .mode = S_IRUGO,
         },
  
         {
@@ -1963,7 +1956,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
         cpuset_inc();
  
         spin_lock_irq(&callback_lock);
-       if (cgroup_on_dfl(cs->css.cgroup)) {
+       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                 cs->effective_mems = parent->effective_mems;
         }
@@ -1996,7 +1989,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
  
         spin_lock_irq(&callback_lock);
         cs->mems_allowed = parent->mems_allowed;
+       cs->effective_mems = parent->mems_allowed;
         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+       cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
         spin_unlock_irq(&callback_lock);
  out_unlock:
         mutex_unlock(&cpuset_mutex);
@@ -2038,7 +2033,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
         mutex_lock(&cpuset_mutex);
         spin_lock_irq(&callback_lock);
  
-       if (cgroup_on_dfl(root_css->cgroup)) {
+       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                 top_cpuset.mems_allowed = node_possible_map;
         } else {
@@ -2219,7 +2214,7 @@ retry:
         cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
         mems_updated = !nodes_equal(new_mems, cs->effective_mems);
  
-       if (cgroup_on_dfl(cs->css.cgroup))
+       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                 hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                      cpus_updated, mems_updated);
         else
@@ -2250,7 +2245,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
         static cpumask_t new_cpus;
         static nodemask_t new_mems;
         bool cpus_updated, mems_updated;
-       bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
+       bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
  
         mutex_lock(&cpuset_mutex);
  
@@ -2404,7 +2399,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
          */
  }
  
-void cpuset_init_current_mems_allowed(void)
+void __init cpuset_init_current_mems_allowed(void)
  {
         nodes_setall(current->mems_allowed);
  }
@@ -2462,20 +2457,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
   * @node: is this an allowed node?
   * @gfp_mask: memory allocation flags
   *
- * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
- * set, yes, we can always allocate.  If node is in our task's mems_allowed,
- * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
- * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
- * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
- * flag, yes.
+ * If we're in interrupt, yes, we can always allocate.  If @node is set in
+ * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
+ * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
+ * yes.  If current has access to memory reserves due to TIF_MEMDIE, yes.
   * Otherwise, no.
   *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first.  By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
   * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
   * and do not allow allocations outside the current tasks cpuset
   * unless the task has been OOM killed as is marked TIF_MEMDIE.
@@ -2511,7 +2498,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
         int allowed;                    /* is allocation in zone z allowed? */
         unsigned long flags;
  
-       if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+       if (in_interrupt())
                 return 1;
         if (node_isset(node, current->mems_allowed))
                 return 1;
@@ -2614,8 +2601,6 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
         return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
  }
  
-#define CPUSET_NODELIST_LEN    (256)
-
  /**
   * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
   * @tsk: pointer to task_struct of some task.
@@ -2625,23 +2610,16 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
   */
  void cpuset_print_task_mems_allowed(struct task_struct *tsk)
  {
-        /* Statically allocated to prevent using excess stack. */
-       static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-       static DEFINE_SPINLOCK(cpuset_buffer_lock);
         struct cgroup *cgrp;
  
-       spin_lock(&cpuset_buffer_lock);
         rcu_read_lock();
  
         cgrp = task_cs(tsk)->css.cgroup;
-       nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
-                          tsk->mems_allowed);
         pr_info("%s cpuset=", tsk->comm);
         pr_cont_cgroup_name(cgrp);
-       pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
+       pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
  
         rcu_read_unlock();
-       spin_unlock(&cpuset_buffer_lock);
  }
  
  /*
@@ -2719,10 +2697,8 @@ out:
  /* Display task mems_allowed in /proc/<pid>/status file. */
  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
  {
-       seq_puts(m, "Mems_allowed:\t");
-       seq_nodemask(m, &task->mems_allowed);
-       seq_puts(m, "\n");
-       seq_puts(m, "Mems_allowed_list:\t");
-       seq_nodemask_list(m, &task->mems_allowed);
-       seq_puts(m, "\n");
+       seq_printf(m, "Mems_allowed:\t%*pb\n",
+                  nodemask_pr_args(&task->mems_allowed));
+       seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
+                  nodemask_pr_args(&task->mems_allowed));
  }