unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
- struct mm_struct *mm;
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
return;
VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
- return;
}
void mem_cgroup_del_lru(struct page *page)
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
u64 limit;
u64 memsw;
- limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
- total_swap_pages;
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ limit += total_swap_pages << PAGE_SHIFT;
+
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
* possibility of race condition. If there is, we take a lock.
*/
-static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
+ unsigned long uninitialized_var(flags);
if (unlikely(!pc))
return;
/* pc->mem_cgroup is unstable ? */
if (unlikely(mem_cgroup_stealed(mem))) {
/* take a lock against to access pc->mem_cgroup */
- lock_page_cgroup(pc);
+ move_lock_page_cgroup(pc, &flags);
need_unlock = true;
mem = pc->mem_cgroup;
if (!mem || !PageCgroupUsed(pc))
goto out;
}
- this_cpu_add(mem->stat->count[idx], val);
-
switch (idx) {
- case MEM_CGROUP_STAT_FILE_MAPPED:
+ case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
}
+ this_cpu_add(mem->stat->count[idx], val);
+
out:
if (unlikely(need_unlock))
- unlock_page_cgroup(pc);
+ move_unlock_page_cgroup(pc, &flags);
rcu_read_unlock();
return;
}
-
-void mem_cgroup_update_file_mapped(struct page *page, int val)
-{
- mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
-}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask,
+ struct mem_cgroup **memcg, bool oom,
+ int page_size)
{
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem = NULL;
int ret;
- int csize = CHARGE_SIZE;
+ int csize = max(CHARGE_SIZE, (unsigned long) page_size);
/*
* Unlike gloval-vm's OOM-kill, we're not in memory shortage
VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;
- if (consume_stock(mem))
+ if (page_size == PAGE_SIZE && consume_stock(mem))
goto done;
css_get(&mem->css);
} else {
rcu_read_unlock();
goto done;
}
- if (consume_stock(mem)) {
+ if (page_size == PAGE_SIZE && consume_stock(mem)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
- csize = PAGE_SIZE;
+ csize = page_size;
css_put(&mem->css);
mem = NULL;
goto again;
}
} while (ret != CHARGE_OK);
- if (csize > PAGE_SIZE)
- refill_stock(mem, csize - PAGE_SIZE);
+ if (csize > page_size)
+ refill_stock(mem, csize - page_size);
css_put(&mem->css);
done:
*memcg = mem;
}
}
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+ int page_size)
{
- __mem_cgroup_cancel_charge(mem, 1);
+ __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
}
/*
* commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
* USED state. If already USED, uncharge and return.
*/
-
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- enum charge_type ctype)
+static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
+ struct page_cgroup *pc,
+ enum charge_type ctype)
{
- /* try_charge() can return NULL to *memcg, taking care of it. */
- if (!mem)
- return;
-
- lock_page_cgroup(pc);
- if (unlikely(PageCgroupUsed(pc))) {
- unlock_page_cgroup(pc);
- mem_cgroup_cancel_charge(mem);
- return;
- }
-
pc->mem_cgroup = mem;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
}
mem_cgroup_charge_statistics(mem, pc, true);
+}
+
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+ struct page_cgroup *pc,
+ enum charge_type ctype,
+ int page_size)
+{
+ int i;
+ int count = page_size >> PAGE_SHIFT;
+
+ /* try_charge() can return NULL to *memcg, taking care of it. */
+ if (!mem)
+ return;
+
+ lock_page_cgroup(pc);
+ if (unlikely(PageCgroupUsed(pc))) {
+ unlock_page_cgroup(pc);
+ mem_cgroup_cancel_charge(mem, page_size);
+ return;
+ }
+
+ /*
+ * we don't need page_cgroup_lock about tail pages, becase they are not
+ * accessed by any other context at this point.
+ */
+ for (i = 0; i < count; i++)
+ ____mem_cgroup_commit_charge(mem, pc + i, ctype);
unlock_page_cgroup(pc);
/*
mem_cgroup_charge_statistics(from, pc, false);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
- mem_cgroup_cancel_charge(from);
+ mem_cgroup_cancel_charge(from, PAGE_SIZE);
/* caller should have done css_get */
pc->mem_cgroup = to;
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
int ret = -EINVAL;
+ unsigned long flags;
+
lock_page_cgroup(pc);
if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+ move_lock_page_cgroup(pc, &flags);
__mem_cgroup_move_account(pc, from, to, uncharge);
+ move_unlock_page_cgroup(pc, &flags);
ret = 0;
}
unlock_page_cgroup(pc);
goto put;
parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
+ PAGE_SIZE);
if (ret || !parent)
goto put_back;
ret = mem_cgroup_move_account(pc, child, parent, true);
if (ret)
- mem_cgroup_cancel_charge(parent);
+ mem_cgroup_cancel_charge(parent, PAGE_SIZE);
put_back:
putback_lru_page(page);
put:
struct mem_cgroup *mem = NULL;
struct page_cgroup *pc;
int ret;
+ int page_size = PAGE_SIZE;
+
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
pc = lookup_page_cgroup(page);
/* can happen at boot */
return 0;
prefetchw(pc);
- ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
if (ret || !mem)
return ret;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
return 0;
}
{
if (mem_cgroup_disabled())
return 0;
- if (PageCompound(page))
- return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
if (!mem)
goto charge_cur_mm;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+ ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, ptr, true);
+ return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
}
static void
cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
- __mem_cgroup_commit_charge(ptr, pc, ctype);
+ __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
mem_cgroup_lru_add_after_commit_swapcache(page);
/*
* Now swap is on-memory. This means this page may be
return;
if (!mem)
return;
- mem_cgroup_cancel_charge(mem);
+ mem_cgroup_cancel_charge(mem, PAGE_SIZE);
}
static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
+ int page_size)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
+ if (page_size != PAGE_SIZE)
+ goto direct_uncharge;
+
/*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
batch->memsw_bytes += PAGE_SIZE;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ res_counter_uncharge(&mem->res, page_size);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ res_counter_uncharge(&mem->memsw, page_size);
if (unlikely(batch->memcg != mem))
memcg_oom_recover(mem);
return;
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
+ int i;
+ int count;
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ int page_size = PAGE_SIZE;
if (mem_cgroup_disabled())
return NULL;
if (PageSwapCache(page))
return NULL;
+ if (PageTransHuge(page)) {
+ page_size <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
+
+ count = page_size >> PAGE_SHIFT;
/*
* Check if our page_cgroup is valid
*/
break;
}
- mem_cgroup_charge_statistics(mem, pc, false);
+ for (i = 0; i < count; i++)
+ mem_cgroup_charge_statistics(mem, pc + i, false);
ClearPageCgroupUsed(pc);
/*
mem_cgroup_get(mem);
}
if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
+ __do_uncharge(mem, ctype, page_size);
return mem;
enum charge_type ctype;
int ret = 0;
+ VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return 0;
return 0;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
css_put(&mem->css);/* drop extra refcnt */
if (ret || *ptr == NULL) {
if (PageAnon(page)) {
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
return ret;
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage, struct page *newpage)
+ struct page *oldpage, struct page *newpage, bool migration_ok)
{
struct page *used, *unused;
struct page_cgroup *pc;
return;
/* blocks rmdir() */
cgroup_exclude_rmdir(&mem->css);
- /* at migration success, oldpage->mapping is NULL. */
- if (oldpage->mapping) {
+ if (!migration_ok) {
used = oldpage;
unused = newpage;
} else {
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
- pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
if (!pn)
return 1;
mem->info.nodeinfo[node] = pn;
- memset(pn, 0, sizeof(*pn));
-
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
- mem = kmalloc(size, GFP_KERNEL);
+ mem = kzalloc(size, GFP_KERNEL);
else
- mem = vmalloc(size);
+ mem = vzalloc(size);
if (!mem)
return NULL;
- memset(mem, 0, size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!mem->stat)
goto out_free;
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+ PAGE_SIZE);
if (ret || !mem)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
pte_t *pte;
spinlock_t *ptl;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (is_target_pte_for_mc(vma, addr, *pte, NULL))
unsigned long precharge;
struct vm_area_struct *vma;
- /* We've already held the mmap_sem */
+ down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
+ up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
- return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+ unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+ VM_BUG_ON(mc.moving_task);
+ mc.moving_task = current;
+ return mem_cgroup_do_precharge(precharge);
}
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
-
mc.moved_swap = 0;
}
- if (mc.mm) {
- up_read(&mc.mm->mmap_sem);
- mmput(mc.mm);
- }
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from;
+
+ /*
+ * we must clear moving_task before waking up waiters at the end of
+ * task migration.
+ */
+ mc.moving_task = NULL;
+ __mem_cgroup_clear_mc();
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
spin_unlock(&mc.lock);
- mc.moving_task = NULL;
- mc.mm = NULL;
mem_cgroup_end_move(from);
- memcg_oom_recover(from);
- memcg_oom_recover(to);
- wake_up_all(&mc.waitq);
}
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
return 0;
/* We move charges only when we move a owner of the mm */
if (mm->owner == p) {
- /*
- * We do all the move charge works under one mmap_sem to
- * avoid deadlock with down_write(&mmap_sem)
- * -> try_charge() -> if (mc.moving_task) -> sleep.
- */
- down_read(&mm->mmap_sem);
-
VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
- VM_BUG_ON(mc.moving_task);
- VM_BUG_ON(mc.mm);
-
mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
- mc.precharge = 0;
- mc.moved_charge = 0;
- mc.moved_swap = 0;
spin_unlock(&mc.lock);
- mc.moving_task = current;
- mc.mm = mm;
+ /* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
- /* We call up_read() and mmput() in clear_mc(). */
- } else
- mmput(mm);
+ }
+ mmput(mm);
}
return ret;
}
spinlock_t *ptl;
retry:
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
struct vm_area_struct *vma;
lru_add_drain_all();
- /* We've already held the mmap_sem */
+retry:
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ /*
+ * Someone who are holding the mmap_sem might be waiting in
+ * waitq. So we cancel all extra charges, wake up all waiters,
+ * and retry. Because we cancel precharges, we might not be able
+ * to move enough charges, but moving charge is a best-effort
+ * feature anyway, so it wouldn't be a big problem.
+ */
+ __mem_cgroup_clear_mc();
+ cond_resched();
+ goto retry;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
*/
break;
}
+ up_read(&mm->mmap_sem);
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct task_struct *p,
bool threadgroup)
{
- if (!mc.mm)
+ struct mm_struct *mm;
+
+ if (!mc.to)
/* no need to move charge */
return;
- mem_cgroup_move_charge(mc.mm);
+ mm = get_task_mm(p);
+ if (mm) {
+ mem_cgroup_move_charge(mm);
+ mmput(mm);
+ }
mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */