#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
-#define CPUSET_SUPER_MAGIC 0x27e0eb
+#define CPUSET_SUPER_MAGIC 0x27e0eb
+
+/*
+ * Tracks how many cpusets are currently defined in system.
+ * When there is only one cpuset (the root cpuset) we can
+ * short circuit some hooks.
+ */
+int number_of_cpusets __read_mostly;
/* See "Frequency meter" comments, below. */
.count = ATOMIC_INIT(0),
.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
.children = LIST_HEAD_INIT(top_cpuset.children),
- .parent = NULL,
- .dentry = NULL,
- .mems_generation = 0,
};
static struct vfsmount *cpuset_mount;
* a tasks cpuset pointer we use task_lock(), which acts on a spinlock
* (task->alloc_lock) already in the task_struct routinely used for
* such matters.
+ *
+ * P.S. One more locking exception. RCU is used to guard the
+ * update of a tasks cpuset pointer by attach_task() and the
+ * access of task->cpuset->mems_generation via that pointer in
+ * the routine cpuset_update_task_memory_state().
*/
static DECLARE_MUTEX(manage_sem);
spin_lock(&dcache_lock);
node = dentry->d_subdirs.next;
while (node != &dentry->d_subdirs) {
- struct dentry *d = list_entry(node, struct dentry, d_child);
+ struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
list_del_init(node);
if (d->d_inode) {
d = dget_locked(d);
}
node = dentry->d_subdirs.next;
}
- list_del_init(&dentry->d_child);
+ list_del_init(&dentry->d_u.d_child);
spin_unlock(&dcache_lock);
remove_dir(dentry);
}
BUG_ON(!nodes_intersects(*pmask, node_online_map));
}
-/*
- * Refresh current tasks mems_allowed and mems_generation from current
- * tasks cpuset.
+/**
+ * cpuset_update_task_memory_state - update task memory placement
+ *
+ * If the current tasks cpusets mems_allowed changed behind our
+ * backs, update current->mems_allowed, mems_generation and task NUMA
+ * mempolicy to the new value.
*
- * Call without callback_sem or task_lock() held. May be called with
- * or without manage_sem held. Will acquire task_lock() and might
- * acquire callback_sem during call.
+ * Task mempolicy is updated by rebinding it relative to the
+ * current->cpuset if a task has its memory placement changed.
+ * Do not call this routine if in_interrupt().
*
- * The task_lock() is required to dereference current->cpuset safely.
- * Without it, we could pick up the pointer value of current->cpuset
- * in one instruction, and then attach_task could give us a different
- * cpuset, and then the cpuset we had could be removed and freed,
- * and then on our next instruction, we could dereference a no longer
- * valid cpuset pointer to get its mems_generation field.
+ * Call without callback_sem or task_lock() held. May be called
+ * with or without manage_sem held. Doesn't need task_lock to guard
+ * against another task changing a non-NULL cpuset pointer to NULL,
+ * as that is only done by a task on itself, and if the current task
+ * is here, it is not simultaneously in the exit code NULL'ing its
+ * cpuset pointer. This routine also might acquire callback_sem and
+ * current->mm->mmap_sem during call.
+ *
+ * Reading current->cpuset->mems_generation doesn't need task_lock
+ * to guard the current->cpuset derefence, because it is guarded
+ * from concurrent freeing of current->cpuset by attach_task(),
+ * using RCU.
+ *
+ * The rcu_dereference() is technically probably not needed,
+ * as I don't actually mind if I see a new cpuset pointer but
+ * an old value of mems_generation. However this really only
+ * matters on alpha systems using cpusets heavily. If I dropped
+ * that rcu_dereference(), it would save them a memory barrier.
+ * For all other arch's, rcu_dereference is a no-op anyway, and for
+ * alpha systems not using cpusets, another planned optimization,
+ * avoiding the rcu critical section for tasks in the root cpuset
+ * which is statically allocated, so can't vanish, will make this
+ * irrelevant. Better to use RCU as intended, than to engage in
+ * some cute trick to save a memory barrier that is impossible to
+ * test, for alpha systems using cpusets heavily, which might not
+ * even exist.
*
* This routine is needed to update the per-task mems_allowed data,
* within the tasks context, when it is trying to allocate memory
* task has been modifying its cpuset.
*/
-static void refresh_mems(void)
+void cpuset_update_task_memory_state()
{
int my_cpusets_mem_gen;
+ struct task_struct *tsk = current;
+ struct cpuset *cs;
- task_lock(current);
- my_cpusets_mem_gen = current->cpuset->mems_generation;
- task_unlock(current);
-
- if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
- struct cpuset *cs;
- nodemask_t oldmem = current->mems_allowed;
- int migrate;
+ if (tsk->cpuset == &top_cpuset) {
+ /* Don't need rcu for top_cpuset. It's never freed. */
+ my_cpusets_mem_gen = top_cpuset.mems_generation;
+ } else {
+ rcu_read_lock();
+ cs = rcu_dereference(tsk->cpuset);
+ my_cpusets_mem_gen = cs->mems_generation;
+ rcu_read_unlock();
+ }
+ if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
down(&callback_sem);
- task_lock(current);
- cs = current->cpuset;
- migrate = is_memory_migrate(cs);
- guarantee_online_mems(cs, ¤t->mems_allowed);
- current->cpuset_mems_generation = cs->mems_generation;
- task_unlock(current);
+ task_lock(tsk);
+ cs = tsk->cpuset; /* Maybe changed when task not locked */
+ guarantee_online_mems(cs, &tsk->mems_allowed);
+ tsk->cpuset_mems_generation = cs->mems_generation;
+ task_unlock(tsk);
up(&callback_sem);
- if (!nodes_equal(oldmem, current->mems_allowed)) {
- numa_policy_rebind(&oldmem, ¤t->mems_allowed);
- if (migrate) {
- do_migrate_pages(current->mm, &oldmem,
- ¤t->mems_allowed,
- MPOL_MF_MOVE_ALL);
- }
- }
+ mpol_rebind_task(tsk, &tsk->mems_allowed);
}
}
}
/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset. Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the tasks
+ * pages to the new memory.
+ *
* Call with manage_sem held. May take callback_sem during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
*/
static int update_nodemask(struct cpuset *cs, char *buf)
{
struct cpuset trialcs;
+ nodemask_t oldmem;
+ struct task_struct *g, *p;
+ struct mm_struct **mmarray;
+ int i, n, ntasks;
+ int migrate;
+ int fudge;
int retval;
trialcs = *cs;
retval = nodelist_parse(buf, trialcs.mems_allowed);
if (retval < 0)
- return retval;
+ goto done;
nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
- if (nodes_empty(trialcs.mems_allowed))
- return -ENOSPC;
+ oldmem = cs->mems_allowed;
+ if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+ retval = 0; /* Too easy - nothing to do */
+ goto done;
+ }
+ if (nodes_empty(trialcs.mems_allowed)) {
+ retval = -ENOSPC;
+ goto done;
+ }
retval = validate_change(cs, &trialcs);
- if (retval == 0) {
- down(&callback_sem);
- cs->mems_allowed = trialcs.mems_allowed;
- atomic_inc(&cpuset_mems_generation);
- cs->mems_generation = atomic_read(&cpuset_mems_generation);
- up(&callback_sem);
+ if (retval < 0)
+ goto done;
+
+ down(&callback_sem);
+ cs->mems_allowed = trialcs.mems_allowed;
+ atomic_inc(&cpuset_mems_generation);
+ cs->mems_generation = atomic_read(&cpuset_mems_generation);
+ up(&callback_sem);
+
+ set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
+
+ fudge = 10; /* spare mmarray[] slots */
+ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+ retval = -ENOMEM;
+
+ /*
+ * Allocate mmarray[] to hold mm reference for each task
+ * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
+ * tasklist_lock. We could use GFP_ATOMIC, but with a
+ * few more lines of code, we can retry until we get a big
+ * enough mmarray[] w/o using GFP_ATOMIC.
+ */
+ while (1) {
+ ntasks = atomic_read(&cs->count); /* guess */
+ ntasks += fudge;
+ mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+ if (!mmarray)
+ goto done;
+ write_lock_irq(&tasklist_lock); /* block fork */
+ if (atomic_read(&cs->count) <= ntasks)
+ break; /* got enough */
+ write_unlock_irq(&tasklist_lock); /* try again */
+ kfree(mmarray);
}
+
+ n = 0;
+
+ /* Load up mmarray[] with mm reference for each task in cpuset. */
+ do_each_thread(g, p) {
+ struct mm_struct *mm;
+
+ if (n >= ntasks) {
+ printk(KERN_WARNING
+ "Cpuset mempolicy rebind incomplete.\n");
+ continue;
+ }
+ if (p->cpuset != cs)
+ continue;
+ mm = get_task_mm(p);
+ if (!mm)
+ continue;
+ mmarray[n++] = mm;
+ } while_each_thread(g, p);
+ write_unlock_irq(&tasklist_lock);
+
+ /*
+ * Now that we've dropped the tasklist spinlock, we can
+ * rebind the vma mempolicies of each mm in mmarray[] to their
+ * new cpuset, and release that mm. The mpol_rebind_mm()
+ * call takes mmap_sem, which we couldn't take while holding
+ * tasklist_lock. Forks can happen again now - the mpol_copy()
+ * cpuset_being_rebound check will catch such forks, and rebind
+ * their vma mempolicies too. Because we still hold the global
+ * cpuset manage_sem, we know that no other rebind effort will
+ * be contending for the global variable cpuset_being_rebound.
+ * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+ * is idempotent. Also migrate pages in each mm to new nodes.
+ */
+ migrate = is_memory_migrate(cs);
+ for (i = 0; i < n; i++) {
+ struct mm_struct *mm = mmarray[i];
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate) {
+ do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
+ MPOL_MF_MOVE_ALL);
+ }
+ mmput(mm);
+ }
+
+ /* We're done rebinding vma's to this cpusets new mems_allowed. */
+ kfree(mmarray);
+ set_cpuset_being_rebound(NULL);
+ retval = 0;
+done:
return retval;
}
struct cpuset *oldcs;
cpumask_t cpus;
nodemask_t from, to;
+ struct mm_struct *mm;
if (sscanf(pidbuf, "%d", &pid) != 1)
return -EIO;
return -ESRCH;
}
atomic_inc(&cs->count);
- tsk->cpuset = cs;
+ rcu_assign_pointer(tsk->cpuset, cs);
task_unlock(tsk);
guarantee_online_cpus(cs, &cpus);
to = cs->mems_allowed;
up(&callback_sem);
+
+ mm = get_task_mm(tsk);
+ if (mm) {
+ mpol_rebind_mm(mm, &to);
+ mmput(mm);
+ }
+
if (is_memory_migrate(cs))
do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
put_task_struct(tsk);
+ synchronize_rcu();
if (atomic_dec_and_test(&oldcs->count))
check_for_release(oldcs, ppathbuf);
return 0;
/*
* cpuset_create_dir - create a directory for an object.
- * cs: the cpuset we create the directory for.
+ * cs: the cpuset we create the directory for.
* It must have a valid ->parent field
* And we are going to fill its ->dentry field.
* name: The name to give to the cpuset directory. Will be copied.
struct dentry *dentry;
int error;
- down(&dir->d_inode->i_sem);
+ mutex_lock(&dir->d_inode->i_mutex);
dentry = cpuset_get_dentry(dir, cft->name);
if (!IS_ERR(dentry)) {
error = cpuset_create_file(dentry, 0644 | S_IFREG);
dput(dentry);
} else
error = PTR_ERR(dentry);
- up(&dir->d_inode->i_sem);
+ mutex_unlock(&dir->d_inode->i_mutex);
return error;
}
return -ENOMEM;
down(&manage_sem);
- refresh_mems();
+ cpuset_update_task_memory_state();
cs->flags = 0;
if (notify_on_release(parent))
set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
down(&callback_sem);
list_add(&cs->sibling, &cs->parent->children);
+ number_of_cpusets++;
up(&callback_sem);
err = cpuset_create_dir(cs, name, mode);
/*
* Release manage_sem before cpuset_populate_dir() because it
- * will down() this new directory's i_sem and if we race with
+ * will down() this new directory's i_mutex and if we race with
* another mkdir, we might deadlock.
*/
up(&manage_sem);
{
struct cpuset *c_parent = dentry->d_parent->d_fsdata;
- /* the vfs holds inode->i_sem already */
+ /* the vfs holds inode->i_mutex already */
return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
}
struct cpuset *parent;
char *pathbuf = NULL;
- /* the vfs holds both inode->i_sem already */
+ /* the vfs holds both inode->i_mutex already */
down(&manage_sem);
- refresh_mems();
+ cpuset_update_task_memory_state();
if (atomic_read(&cs->count) > 0) {
up(&manage_sem);
return -EBUSY;
spin_unlock(&d->d_lock);
cpuset_d_remove_dir(d);
dput(d);
+ number_of_cpusets--;
up(&callback_sem);
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
return 0;
}
+/*
+ * cpuset_init_early - just enough so that the calls to
+ * cpuset_update_task_memory_state() in early init code
+ * are harmless.
+ */
+
+int __init cpuset_init_early(void)
+{
+ struct task_struct *tsk = current;
+
+ tsk->cpuset = &top_cpuset;
+ tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
+ return 0;
+}
+
/**
* cpuset_init - initialize cpusets at system boot
*
root->d_inode->i_nlink++;
top_cpuset.dentry = root;
root->d_inode->i_op = &cpuset_dir_inode_operations;
+ number_of_cpusets = 1;
err = cpuset_populate_dir(root);
/* memory_pressure_enabled is in root cpuset only */
if (err == 0)
*
* We don't need to task_lock() this reference to tsk->cpuset,
* because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it.
+ * mess with it, or task is a failed fork, never visible to attach_task.
**/
void cpuset_exit(struct task_struct *tsk)
{
struct cpuset *cs;
- BUG_ON(!(tsk->flags & PF_EXITING));
-
cs = tsk->cpuset;
tsk->cpuset = NULL;
* tasks cpuset.
**/
-cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
+cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
{
cpumask_t mask;
down(&callback_sem);
- task_lock((struct task_struct *)tsk);
+ task_lock(tsk);
guarantee_online_cpus(tsk->cpuset, &mask);
- task_unlock((struct task_struct *)tsk);
+ task_unlock(tsk);
up(&callback_sem);
return mask;
}
/**
- * cpuset_update_current_mems_allowed - update mems parameters to new values
- *
- * If the current tasks cpusets mems_allowed changed behind our backs,
- * update current->mems_allowed and mems_generation to the new value.
- * Do not call this routine if in_interrupt().
+ * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
*
- * Call without callback_sem or task_lock() held. May be called
- * with or without manage_sem held. Unless exiting, it will acquire
- * task_lock(). Also might acquire callback_sem during call to
- * refresh_mems().
- */
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of node_online_map, even if this means going outside the
+ * tasks cpuset.
+ **/
-void cpuset_update_current_mems_allowed(void)
+nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
- struct cpuset *cs;
- int need_to_refresh = 0;
+ nodemask_t mask;
- task_lock(current);
- cs = current->cpuset;
- if (!cs)
- goto done;
- if (current->cpuset_mems_generation != cs->mems_generation)
- need_to_refresh = 1;
-done:
- task_unlock(current);
- if (need_to_refresh)
- refresh_mems();
+ down(&callback_sem);
+ task_lock(tsk);
+ guarantee_online_mems(tsk->cpuset, &mask);
+ task_unlock(tsk);
+ up(&callback_sem);
+
+ return mask;
}
/**
* GFP_USER - only nodes in current tasks mems allowed ok.
**/
-int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
int node; /* node that zone z is on */
const struct cpuset *cs; /* current cpuset ancestors */
* cpuset file 'memory_pressure_enabled' in the root cpuset.
*/
-int cpuset_memory_pressure_enabled;
+int cpuset_memory_pressure_enabled __read_mostly;
/**
* cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.