unsigned long targets[MEM_CGROUP_NTARGETS];
};
-struct mem_cgroup_reclaim_iter {
- /*
- * last scanned hierarchy member. Valid only if last_dead_count
- * matches memcg->dead_count of the hierarchy root group.
- */
- struct mem_cgroup *last_visited;
- int last_dead_count;
-
+struct reclaim_iter {
+ struct mem_cgroup *position;
/* scan generation, increased every round-trip */
unsigned int generation;
};
struct lruvec lruvec;
unsigned long lru_size[NR_LRU_LISTS];
- struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+ struct reclaim_iter iter[DEF_PRIORITY + 1];
struct rb_node tree_node; /* RB tree node */
unsigned long usage_in_excess;/* Set to the value by which */
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
- atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct cg_proto tcp_mem;
#endif
/* internal only representation about the status of kmem accounting. */
enum {
KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
- KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
#ifdef CONFIG_MEMCG_KMEM
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
-static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
-{
- /*
- * Our caller must use css_get() first, because memcg_uncharge_kmem()
- * will call css_put() if it sees the memcg is dead.
- */
- smp_wmb();
- if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
- set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
-}
-
-static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
-{
- return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
- &memcg->kmem_account_flags);
-}
#endif
/* Stuffs for move charges at task migration. */
disarm_kmem_keys(memcg);
}
-static void drain_all_stock_async(struct mem_cgroup *memcg);
-
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
return memcg;
}
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
- struct mem_cgroup *last_visited)
-{
- struct cgroup_subsys_state *prev_css, *next_css;
-
- prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
- next_css = css_next_descendant_pre(prev_css, &root->css);
-
- /*
- * Even if we found a group we have to make sure it is
- * alive. css && !memcg means that the groups should be
- * skipped and we should continue the tree walk.
- * last_visited css is safe to use because it is
- * protected by css_get and the tree walk is rcu safe.
- *
- * We do not take a reference on the root of the tree walk
- * because we might race with the root removal when it would
- * be the only node in the iterated hierarchy and mem_cgroup_iter
- * would end up in an endless loop because it expects that at
- * least one valid node will be returned. Root cannot disappear
- * because caller of the iterator should hold it already so
- * skipping css reference should be safe.
- */
- if (next_css) {
- struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
-
- if (next_css == &root->css)
- return memcg;
-
- if (css_tryget_online(next_css)) {
- /*
- * Make sure the memcg is initialized:
- * mem_cgroup_css_online() orders the the
- * initialization against setting the flag.
- */
- if (smp_load_acquire(&memcg->initialized))
- return memcg;
- css_put(next_css);
- }
-
- prev_css = next_css;
- goto skip_node;
- }
-
- return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
- /*
- * When a group in the hierarchy below root is destroyed, the
- * hierarchy iterator can no longer be trusted since it might
- * have pointed to the destroyed group. Invalidate it.
- */
- atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
- struct mem_cgroup *root,
- int *sequence)
-{
- struct mem_cgroup *position = NULL;
- /*
- * A cgroup destruction happens in two stages: offlining and
- * release. They are separated by a RCU grace period.
- *
- * If the iterator is valid, we may still race with an
- * offlining. The RCU lock ensures the object won't be
- * released, tryget will fail if we lost the race.
- */
- *sequence = atomic_read(&root->dead_count);
- if (iter->last_dead_count == *sequence) {
- smp_rmb();
- position = iter->last_visited;
-
- /*
- * We cannot take a reference to root because we might race
- * with root removal and returning NULL would end up in
- * an endless loop on the iterator user level when root
- * would be returned all the time.
- */
- if (position && position != root &&
- !css_tryget_online(&position->css))
- position = NULL;
- }
- return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
- struct mem_cgroup *last_visited,
- struct mem_cgroup *new_position,
- struct mem_cgroup *root,
- int sequence)
-{
- /* root reference counting symmetric to mem_cgroup_iter_load */
- if (last_visited && last_visited != root)
- css_put(&last_visited->css);
- /*
- * We store the sequence count from the time @last_visited was
- * loaded successfully instead of rereading it here so that we
- * don't lose destruction events in between. We could have
- * raced with the destruction of @new_position after all.
- */
- iter->last_visited = new_position;
- smp_wmb();
- iter->last_dead_count = sequence;
-}
-
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
+ struct reclaim_iter *uninitialized_var(iter);
+ struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
- struct mem_cgroup *last_visited = NULL;
+ struct mem_cgroup *pos = NULL;
if (mem_cgroup_disabled())
return NULL;
root = root_mem_cgroup;
if (prev && !reclaim)
- last_visited = prev;
+ pos = prev;
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
- goto out_css_put;
+ goto out;
return root;
}
rcu_read_lock();
- while (!memcg) {
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
- int uninitialized_var(seq);
-
- if (reclaim) {
- struct mem_cgroup_per_zone *mz;
-
- mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
- iter = &mz->reclaim_iter[reclaim->priority];
- if (prev && reclaim->generation != iter->generation) {
- iter->last_visited = NULL;
- goto out_unlock;
- }
- last_visited = mem_cgroup_iter_load(iter, root, &seq);
+ if (reclaim) {
+ struct mem_cgroup_per_zone *mz;
+
+ mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+ iter = &mz->iter[reclaim->priority];
+
+ if (prev && reclaim->generation != iter->generation)
+ goto out_unlock;
+
+ do {
+ pos = ACCESS_ONCE(iter->position);
+ /*
+ * A racing update may change the position and
+ * put the last reference, hence css_tryget(),
+ * or retry to see the updated position.
+ */
+ } while (pos && !css_tryget(&pos->css));
+ }
+
+ if (pos)
+ css = &pos->css;
+
+ for (;;) {
+ css = css_next_descendant_pre(css, &root->css);
+ if (!css) {
+ /*
+ * Reclaimers share the hierarchy walk, and a
+ * new one might jump in right at the end of
+ * the hierarchy - make sure they see at least
+ * one group and restart from the beginning.
+ */
+ if (!prev)
+ continue;
+ break;
}
- memcg = __mem_cgroup_iter_next(root, last_visited);
+ /*
+ * Verify the css and acquire a reference. The root
+ * is provided by the caller, so we know it's alive
+ * and kicking, and don't take an extra reference.
+ */
+ memcg = mem_cgroup_from_css(css);
+
+ if (css == &root->css)
+ break;
- if (reclaim) {
- mem_cgroup_iter_update(iter, last_visited, memcg, root,
- seq);
+ if (css_tryget(css)) {
+ /*
+ * Make sure the memcg is initialized:
+ * mem_cgroup_css_online() orders the the
+ * initialization against setting the flag.
+ */
+ if (smp_load_acquire(&memcg->initialized))
+ break;
- if (!memcg)
- iter->generation++;
- else if (!prev && memcg)
- reclaim->generation = iter->generation;
+ css_put(css);
}
- if (prev && !memcg)
- goto out_unlock;
+ memcg = NULL;
}
+
+ if (reclaim) {
+ if (cmpxchg(&iter->position, pos, memcg) == pos) {
+ if (memcg)
+ css_get(&memcg->css);
+ if (pos)
+ css_put(&pos->css);
+ }
+
+ /*
+ * pairs with css_tryget when dereferencing iter->position
+ * above.
+ */
+ if (pos)
+ css_put(&pos->css);
+
+ if (!memcg)
+ iter->generation++;
+ else if (!prev)
+ reclaim->generation = iter->generation;
+ }
+
out_unlock:
rcu_read_unlock();
-out_css_put:
+out:
if (prev && prev != root)
css_put(&prev->css);
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_swap_account)
page_counter_uncharge(&old->memsw, stock->nr_pages);
+ css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
stock->cached = NULL;
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
- * of the hierarchy under it. sync flag says whether we should block
- * until the work is done.
+ * of the hierarchy under it.
*/
-static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
+static void drain_all_stock(struct mem_cgroup *root_memcg)
{
int cpu, curcpu;
+ /* If someone's already draining, avoid adding running more workers. */
+ if (!mutex_trylock(&percpu_charge_mutex))
+ return;
/* Notify other cpus that system-wide "drain" is running */
get_online_cpus();
curcpu = get_cpu();
}
}
put_cpu();
-
- if (!sync)
- goto out;
-
- for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
- flush_work(&stock->work);
- }
-out:
put_online_cpus();
-}
-
-/*
- * Tries to drain stocked charges in other cpus. This function is asynchronous
- * and just put a work per cpu for draining localy on each cpu. Caller can
- * expects some charges will be back later but cannot wait for it.
- */
-static void drain_all_stock_async(struct mem_cgroup *root_memcg)
-{
- /*
- * If someone calls draining, avoid adding more kworker runs.
- */
- if (!mutex_trylock(&percpu_charge_mutex))
- return;
- drain_all_stock(root_memcg, false);
- mutex_unlock(&percpu_charge_mutex);
-}
-
-/* This is a synchronous drain interface. */
-static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
-{
- /* called when force_empty is called */
- mutex_lock(&percpu_charge_mutex);
- drain_all_stock(root_memcg, true);
mutex_unlock(&percpu_charge_mutex);
}
goto retry;
if (!drained) {
- drain_all_stock_async(mem_over_limit);
+ drain_all_stock(mem_over_limit);
drained = true;
goto retry;
}
return -EINTR;
done_restock:
+ css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
done:
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_swap_account)
page_counter_uncharge(&memcg->memsw, nr_pages);
+
+ css_put_many(&memcg->css, nr_pages);
}
/*
page_counter_charge(&memcg->memory, nr_pages);
if (do_swap_account)
page_counter_charge(&memcg->memsw, nr_pages);
+ css_get_many(&memcg->css, nr_pages);
ret = 0;
} else if (ret)
page_counter_uncharge(&memcg->kmem, nr_pages);
if (do_swap_account)
page_counter_uncharge(&memcg->memsw, nr_pages);
- /* Not down to 0 */
- if (page_counter_uncharge(&memcg->kmem, nr_pages))
- return;
+ page_counter_uncharge(&memcg->kmem, nr_pages);
- /*
- * Releases a reference taken in kmem_cgroup_css_offline in case
- * this last uncharge is racing with the offlining code or it is
- * outliving the memcg existence.
- *
- * The memory barrier imposed by test&clear is paired with the
- * explicit one in memcg_kmem_mark_dead().
- */
- if (memcg_kmem_test_and_clear_dead(memcg))
- css_put(&memcg->css);
+ css_put_many(&memcg->css, nr_pages);
}
/*
return ret;
}
-/**
- * mem_cgroup_move_parent - moves page to the parent group
- * @page: the page to move
- * @pc: page_cgroup of the page
- * @child: page's cgroup
- *
- * move charges to its parent or the root cgroup if the group has no
- * parent (aka use_hierarchy==0).
- * Although this might fail (get_page_unless_zero, isolate_lru_page or
- * mem_cgroup_move_account fails) the failure is always temporary and
- * it signals a race with a page removal/uncharge or migration. In the
- * first case the page is on the way out and it will vanish from the LRU
- * on the next attempt and the call should be retried later.
- * Isolation from the LRU fails only if page has been isolated from
- * the LRU since we looked at it and that usually means either global
- * reclaim or migration going on. The page will either get back to the
- * LRU or vanish.
- * Finaly mem_cgroup_move_account fails only if the page got uncharged
- * (!PageCgroupUsed) or moved to a different group. The page will
- * disappear in the next attempt.
- */
-static int mem_cgroup_move_parent(struct page *page,
- struct page_cgroup *pc,
- struct mem_cgroup *child)
-{
- struct mem_cgroup *parent;
- unsigned int nr_pages;
- unsigned long uninitialized_var(flags);
- int ret;
-
- VM_BUG_ON(mem_cgroup_is_root(child));
-
- ret = -EBUSY;
- if (!get_page_unless_zero(page))
- goto out;
- if (isolate_lru_page(page))
- goto put;
-
- nr_pages = hpage_nr_pages(page);
-
- parent = parent_mem_cgroup(child);
- /*
- * If no parent, move charges to root cgroup.
- */
- if (!parent)
- parent = root_mem_cgroup;
-
- if (nr_pages > 1) {
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- flags = compound_lock_irqsave(page);
- }
-
- ret = mem_cgroup_move_account(page, nr_pages,
- pc, child, parent);
- if (!ret) {
- /* Take charge off the local counters */
- page_counter_cancel(&child->memory, nr_pages);
- if (do_swap_account)
- page_counter_cancel(&child->memsw, nr_pages);
- }
-
- if (nr_pages > 1)
- compound_unlock_irqrestore(page, flags);
- putback_lru_page(page);
-put:
- put_page(page);
-out:
- return ret;
-}
-
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
return nr_reclaimed;
}
-/**
- * mem_cgroup_force_empty_list - clears LRU of a group
- * @memcg: group to clear
- * @node: NUMA node
- * @zid: zone id
- * @lru: lru to to clear
- *
- * Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - pages are moved to the parent (or root)
- * group.
- */
-static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
- int node, int zid, enum lru_list lru)
-{
- struct lruvec *lruvec;
- unsigned long flags;
- struct list_head *list;
- struct page *busy;
- struct zone *zone;
-
- zone = &NODE_DATA(node)->node_zones[zid];
- lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- list = &lruvec->lists[lru];
-
- busy = NULL;
- do {
- struct page_cgroup *pc;
- struct page *page;
-
- spin_lock_irqsave(&zone->lru_lock, flags);
- if (list_empty(list)) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- break;
- }
- page = list_entry(list->prev, struct page, lru);
- if (busy == page) {
- list_move(&page->lru, list);
- busy = NULL;
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- continue;
- }
- spin_unlock_irqrestore(&zone->lru_lock, flags);
-
- pc = lookup_page_cgroup(page);
-
- if (mem_cgroup_move_parent(page, pc, memcg)) {
- /* found lock contention or "pc" is obsolete. */
- busy = page;
- } else
- busy = NULL;
- cond_resched();
- } while (!list_empty(list));
-}
-
-/*
- * make mem_cgroup's charge to be 0 if there is no task by moving
- * all the charges and pages to the parent.
- * This enables deleting this mem_cgroup.
- *
- * Caller is responsible for holding css reference on the memcg.
- */
-static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
-{
- int node, zid;
-
- do {
- /* This is for making all *used* pages to be on LRU. */
- lru_add_drain_all();
- drain_all_stock_sync(memcg);
- mem_cgroup_start_move(memcg);
- for_each_node_state(node, N_MEMORY) {
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- enum lru_list lru;
- for_each_lru(lru) {
- mem_cgroup_force_empty_list(memcg,
- node, zid, lru);
- }
- }
- }
- mem_cgroup_end_move(memcg);
- memcg_oom_recover(memcg);
- cond_resched();
-
- /*
- * Kernel memory may not necessarily be trackable to a specific
- * process. So they are not migrated, and therefore we can't
- * expect their value to drop to 0 here.
- * Having res filled up with kmem only is enough.
- *
- * This is a safety check because mem_cgroup_force_empty_list
- * could have raced with mem_cgroup_replace_page_cache callers
- * so the lru seemed empty but the page could have been added
- * right after the check. RES_USAGE should be safe as we always
- * charge before adding to the LRU.
- */
- } while (page_counter_read(&memcg->memory) -
- page_counter_read(&memcg->kmem) > 0);
-}
-
/*
* Test whether @memcg has children, dead or alive. Note that this
* function doesn't care whether @memcg has use_hierarchy enabled and
{
mem_cgroup_sockets_destroy(memcg);
}
-
-static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
-{
- if (!memcg_kmem_is_active(memcg))
- return;
-
- /*
- * kmem charges can outlive the cgroup. In the case of slab
- * pages, for instance, a page contain objects from various
- * processes. As we prevent from taking a reference for every
- * such allocation we have to be careful when doing uncharge
- * (see memcg_uncharge_kmem) and here during offlining.
- *
- * The idea is that that only the _last_ uncharge which sees
- * the dead memcg will drop the last reference. An additional
- * reference is taken here before the group is marked dead
- * which is then paired with css_put during uncharge resp. here.
- *
- * Although this might sound strange as this path is called from
- * css_offline() when the referencemight have dropped down to 0 and
- * shouldn't be incremented anymore (css_tryget_online() would
- * fail) we do not have other options because of the kmem
- * allocations lifetime.
- */
- css_get(&memcg->css);
-
- memcg_kmem_mark_dead(memcg);
-
- if (page_counter_read(&memcg->kmem))
- return;
-
- if (memcg_kmem_test_and_clear_dead(memcg))
- css_put(&memcg->css);
-}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
}
-
-static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
-{
-}
#endif
/*
return 0;
}
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
- struct mem_cgroup *parent = memcg;
-
- while ((parent = parent_mem_cgroup(parent)))
- mem_cgroup_iter_invalidate(parent);
-
- /*
- * if the root memcg is not hierarchical we have to check it
- * explicitely.
- */
- if (!root_mem_cgroup->use_hierarchy)
- mem_cgroup_iter_invalidate(root_mem_cgroup);
-}
-
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_event *event, *tmp;
- struct cgroup_subsys_state *iter;
/*
* Unregister events and notify userspace.
}
spin_unlock(&memcg->event_list_lock);
- kmem_cgroup_css_offline(memcg);
-
- mem_cgroup_invalidate_reclaim_iterators(memcg);
-
- /*
- * This requires that offlining is serialized. Right now that is
- * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
- */
- css_for_each_descendant_post(iter, css)
- mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
-
memcg_unregister_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- /*
- * XXX: css_offline() would be where we should reparent all
- * memory to prepare the cgroup for destruction. However,
- * memcg does not do css_tryget_online() and page_counter charging
- * under the same RCU lock region, which means that charging
- * could race with offlining. Offlining only happens to
- * cgroups with no tasks in them but charges can show up
- * without any tasks from the swapin path when the target
- * memcg is looked up from the swapout record and not from the
- * current task as it usually is. A race like this can leak
- * charges and put pages with stale cgroup pointers into
- * circulation:
- *
- * #0 #1
- * lookup_swap_cgroup_id()
- * rcu_read_lock()
- * mem_cgroup_lookup()
- * css_tryget_online()
- * rcu_read_unlock()
- * disable css_tryget_online()
- * call_rcu()
- * offline_css()
- * reparent_charges()
- * page_counter_try_charge()
- * css_put()
- * css_free()
- * pc->mem_cgroup = dead memcg
- * add page to lru
- *
- * The bulk of the charges are still moved in offline_css() to
- * avoid pinning a lot of pages in case a long-term reference
- * like a swapout record is deferring the css_free() to long
- * after offlining. But this makes sure we catch any charges
- * made after offlining:
- */
- mem_cgroup_reparent_charges(memcg);
memcg_destroy_kmem(memcg);
__mem_cgroup_free(memcg);
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
- int i;
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
if (!mem_cgroup_is_root(mc.to))
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
- for (i = 0; i < mc.moved_swap; i++)
- css_put(&mc.from->css);
+ css_put_many(&mc.from->css, mc.moved_swap);
/* we've already done css_get(mc.to) */
mc.moved_swap = 0;
__this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
memcg_check_events(memcg, dummy_page);
local_irq_restore(flags);
+
+ if (!mem_cgroup_is_root(memcg))
+ css_put_many(&memcg->css, max(nr_mem, nr_memsw));
}
static void uncharge_list(struct list_head *page_list)