git.karo-electronics.de Git - linux-beck.git/blobdiff - mm/memcontrol.c
mm: memcg: remove optimization of keeping the root_mem_cgroup LRU lists empty
[linux-beck.git] / mm / memcontrol.c
index 6edef95fecf4bf2c9f1debeaaabf63d7bb82c94f..ad7f36f676ffbb7fdd9fa0545e89bcc02e153515 100644 (file)
@@ -123,6 +123,13 @@ struct mem_cgroup_stat_cpu {
        unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
+struct mem_cgroup_reclaim_iter {
+       /* css_id of the last scanned member; next lookup starts at id + 1 */
+       int position;
+       /* scan generation, increased each time a walk hits the end */
+       unsigned int generation;
+};
+
 /*
  * per-zone information in memory controller.
  */
@@ -133,6 +140,8 @@ struct mem_cgroup_per_zone {
        struct list_head        lists[NR_LRU_LISTS];
        unsigned long           count[NR_LRU_LISTS];
 
+       struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+
        struct zone_reclaim_stat reclaim_stat;
        struct rb_node          tree_node;      /* RB tree node */
        unsigned long long      usage_in_excess;/* Set to the value by which */
@@ -233,11 +242,6 @@ struct mem_cgroup {
         * per zone LRU lists.
         */
        struct mem_cgroup_lru_info info;
-       /*
-        * While reclaiming in a hierarchy, we cache the last child we
-        * reclaimed from.
-        */
-       int last_scanned_child;
        int last_scanned_node;
 #if MAX_NUMNODES > 1
        nodemask_t      scan_nodes;
@@ -366,8 +370,6 @@ enum charge_type {
 #define MEM_CGROUP_RECLAIM_NOSWAP      (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT  0x1
 #define MEM_CGROUP_RECLAIM_SHRINK      (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-#define MEM_CGROUP_RECLAIM_SOFT_BIT    0x2
-#define MEM_CGROUP_RECLAIM_SOFT                (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
@@ -853,13 +855,33 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
        return memcg;
 }
 
-static struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
-                                         struct mem_cgroup *prev,
-                                         bool reclaim)
+/**
+ * mem_cgroup_iter - iterate over memory cgroup hierarchy
+ * @root: hierarchy root
+ * @prev: previously returned memcg, NULL on first invocation
+ * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ *
+ * Returns references to children of the hierarchy below @root, or
+ * @root itself, or %NULL after a full round-trip.
+ *
+ * Caller must pass the return value in @prev on subsequent
+ * invocations for reference counting, or use mem_cgroup_iter_break()
+ * to cancel a hierarchy walk before the round-trip is complete.
+ *
+ * Reclaimers can specify a zone and a priority level in @reclaim to
+ * divide up the memcgs in the hierarchy among all concurrent
+ * reclaimers operating on the same zone and priority.
+ */
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+                                  struct mem_cgroup *prev,
+                                  struct mem_cgroup_reclaim_cookie *reclaim)
 {
        struct mem_cgroup *memcg = NULL;
        int id = 0;
 
+       if (mem_cgroup_disabled())
+               return NULL;
+
        if (!root)
                root = root_mem_cgroup;
 
@@ -876,10 +898,20 @@ static struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
        }
 
        while (!memcg) {
+               struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
                struct cgroup_subsys_state *css;
 
-               if (reclaim)
-                       id = root->last_scanned_child;
+               if (reclaim) {
+                       int nid = zone_to_nid(reclaim->zone);
+                       int zid = zone_idx(reclaim->zone);
+                       struct mem_cgroup_per_zone *mz;
+
+                       mz = mem_cgroup_zoneinfo(root, nid, zid);
+                       iter = &mz->reclaim_iter[reclaim->priority];
+                       if (prev && reclaim->generation != iter->generation)
+                               return NULL;
+                       id = iter->position;
+               }
 
                rcu_read_lock();
                css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
@@ -891,8 +923,13 @@ static struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                        id = 0;
                rcu_read_unlock();
 
-               if (reclaim)
-                       root->last_scanned_child = id;
+               if (reclaim) {
+                       iter->position = id;
+                       if (!css)
+                               iter->generation++;
+                       else if (!prev && memcg)
+                               reclaim->generation = iter->generation;
+               }
 
                if (prev && !css)
                        return NULL;
@@ -900,8 +937,13 @@ static struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
        return memcg;
 }
 
-static void mem_cgroup_iter_break(struct mem_cgroup *root,
-                                 struct mem_cgroup *prev)
+/**
+ * mem_cgroup_iter_break - abort a hierarchy walk prematurely
+ * @root: hierarchy root
+ * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
+ */
+void mem_cgroup_iter_break(struct mem_cgroup *root,
+                          struct mem_cgroup *prev)
 {
        if (!root)
                root = root_mem_cgroup;
@@ -915,14 +957,14 @@ static void mem_cgroup_iter_break(struct mem_cgroup *root,
  * be used for reference counting.
  */
 #define for_each_mem_cgroup_tree(iter, root)           \
-       for (iter = mem_cgroup_iter(root, NULL, false); \
+       for (iter = mem_cgroup_iter(root, NULL, NULL);  \
             iter != NULL;                              \
-            iter = mem_cgroup_iter(root, iter, false))
+            iter = mem_cgroup_iter(root, iter, NULL))
 
 #define for_each_mem_cgroup(iter)                      \
-       for (iter = mem_cgroup_iter(NULL, NULL, false); \
+       for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
             iter != NULL;                              \
-            iter = mem_cgroup_iter(NULL, iter, false))
+            iter = mem_cgroup_iter(NULL, iter, NULL))
 
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
@@ -989,8 +1031,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
        mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
        /* huge page split is done under lru_lock. so, we have no races. */
        MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
-       if (mem_cgroup_is_root(pc->mem_cgroup))
-               return;
        VM_BUG_ON(list_empty(&pc->lru));
        list_del_init(&pc->lru);
 }
@@ -1015,13 +1055,11 @@ void mem_cgroup_rotate_reclaimable_page(struct page *page)
                return;
 
        pc = lookup_page_cgroup(page);
-       /* unused or root page is not rotated. */
+       /* unused page is not rotated. */
        if (!PageCgroupUsed(pc))
                return;
        /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
        smp_rmb();
-       if (mem_cgroup_is_root(pc->mem_cgroup))
-               return;
        mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
        list_move_tail(&pc->lru, &mz->lists[lru]);
 }
@@ -1035,13 +1073,11 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
                return;
 
        pc = lookup_page_cgroup(page);
-       /* unused or root page is not rotated. */
+       /* unused page is not rotated. */
        if (!PageCgroupUsed(pc))
                return;
        /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
        smp_rmb();
-       if (mem_cgroup_is_root(pc->mem_cgroup))
-               return;
        mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
        list_move(&pc->lru, &mz->lists[lru]);
 }
@@ -1073,8 +1109,6 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
        /* huge page split is done under lru_lock. so, we have no races. */
        MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
        SetPageCgroupAcctLRU(pc);
-       if (mem_cgroup_is_root(pc->mem_cgroup))
-               return;
        list_add(&pc->lru, &mz->lists[lru]);
 }
 
@@ -1529,6 +1563,42 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
        return min(limit, memsw);
 }
 
+static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
+                                       gfp_t gfp_mask,
+                                       unsigned long flags)
+{
+       unsigned long total = 0; /* sum of try_to_free_mem_cgroup_pages() */
+       bool noswap = false;
+       int loop; /* number of reclaim attempts already made */
+
+       if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
+               noswap = true;
+       if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
+               noswap = true; /* memsw_is_minimum: swap-out is of no use */
+
+       for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
+               if (loop) /* retries only: drain per-cpu cached charges */
+                       drain_all_stock_async(memcg);
+               total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
+               /*
+                * Allow limit shrinkers, which are triggered directly
+                * by userspace, to catch signals and stop reclaim
+                * after minimal progress, regardless of the margin.
+                */
+               if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
+                       break;
+               if (mem_cgroup_margin(memcg)) /* there is room to charge again */
+                       break;
+               /*
+                * If nothing was reclaimed after two attempts, there
+                * may be no reclaimable pages in this hierarchy.
+                */
+               if (loop && !total)
+                       break;
+       }
+       return total;
+}
+
 /**
  * test_mem_cgroup_node_reclaimable
  * @mem: the target memcg
@@ -1666,58 +1736,34 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 }
 #endif
 
-/*
- * Scan the hierarchy if needed to reclaim memory. We remember the last child
- * we reclaimed from, so that we don't end up penalizing one child extensively
- * based on its position in the children list.
- *
- * root_memcg is the original ancestor that we've been reclaim from.
- *
- * We give up and return to the caller when we visit root_memcg twice.
- * (other groups can be removed while we're walking....)
- *
- * If shrink==true, for avoiding to free too much, this returns immedieately.
- */
-static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
-                                               struct zone *zone,
-                                               gfp_t gfp_mask,
-                                               unsigned long reclaim_options,
-                                               unsigned long *total_scanned)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+                                  struct zone *zone,
+                                  gfp_t gfp_mask,
+                                  unsigned long *total_scanned)
 {
        struct mem_cgroup *victim = NULL;
-       int ret, total = 0;
+       int total = 0;
        int loop = 0;
-       bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
-       bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
-       bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
        unsigned long excess;
        unsigned long nr_scanned;
+       struct mem_cgroup_reclaim_cookie reclaim = {
+               .zone = zone,
+               .priority = 0,
+       };
 
        excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
 
-       /* If memsw_is_minimum==1, swap-out is of-no-use. */
-       if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
-               noswap = true;
-
        while (1) {
-               victim = mem_cgroup_iter(root_memcg, victim, true);
+               victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
                if (!victim) {
                        loop++;
-                       /*
-                        * We are not draining per cpu cached charges during
-                        * soft limit reclaim  because global reclaim doesn't
-                        * care about charges. It tries to free some memory and
-                        * charges will not give any.
-                        */
-                       if (!check_soft && loop >= 1)
-                               drain_all_stock_async(root_memcg);
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
                                 * anything, it might because there are
                                 * no reclaimable pages under this hierarchy
                                 */
-                               if (!check_soft || !total)
+                               if (!total)
                                        break;
                                /*
                                 * We want to do more targeted reclaim.
@@ -1731,30 +1777,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
                        }
                        continue;
                }
-               if (!mem_cgroup_reclaimable(victim, noswap)) {
-                       /* this cgroup's local usage == 0 */
+               if (!mem_cgroup_reclaimable(victim, false))
                        continue;
-               }
-               /* we use swappiness of local cgroup */
-               if (check_soft) {
-                       ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-                               noswap, zone, &nr_scanned);
-                       *total_scanned += nr_scanned;
-               } else
-                       ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
-                                               noswap);
-               total += ret;
-               /*
-                * At shrinking usage, we can't check we should stop here or
-                * reclaim more. It's depends on callers. last_scanned_child
-                * will work enough for keeping fairness under tree.
-                */
-               if (shrink)
-                       break;
-               if (check_soft) {
-                       if (!res_counter_soft_limit_excess(&root_memcg->res))
-                               break;
-               } else if (mem_cgroup_margin(root_memcg))
+               total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+                                                    zone, &nr_scanned);
+               *total_scanned += nr_scanned;
+               if (!res_counter_soft_limit_excess(&root_memcg->res))
                        break;
        }
        mem_cgroup_iter_break(root_memcg, victim);
@@ -2251,8 +2279,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
        if (!(gfp_mask & __GFP_WAIT))
                return CHARGE_WOULDBLOCK;
 
-       ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-                                             gfp_mask, flags, NULL);
+       ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                return CHARGE_RETRY;
        /*
@@ -3529,9 +3556,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                if (!ret)
                        break;
 
-               mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
-                                               MEM_CGROUP_RECLAIM_SHRINK,
-                                               NULL);
+               mem_cgroup_reclaim(memcg, GFP_KERNEL,
+                                  MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
                /* Usage is reduced ? */
                if (curusage >= oldusage)
@@ -3589,10 +3615,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                if (!ret)
                        break;
 
-               mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
-                                               MEM_CGROUP_RECLAIM_NOSWAP |
-                                               MEM_CGROUP_RECLAIM_SHRINK,
-                                               NULL);
+               mem_cgroup_reclaim(memcg, GFP_KERNEL,
+                                  MEM_CGROUP_RECLAIM_NOSWAP |
+                                  MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
                /* Usage is reduced ? */
                if (curusage >= oldusage)
@@ -3635,10 +3660,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                        break;
 
                nr_scanned = 0;
-               reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
-                                               gfp_mask,
-                                               MEM_CGROUP_RECLAIM_SOFT,
-                                               &nr_scanned);
+               reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
+                                                   gfp_mask, &nr_scanned);
                nr_reclaimed += reclaimed;
                *total_scanned += nr_scanned;
                spin_lock(&mctz->lock);
@@ -5028,7 +5051,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                res_counter_init(&memcg->res, NULL);
                res_counter_init(&memcg->memsw, NULL);
        }
-       memcg->last_scanned_child = 0;
        memcg->last_scanned_node = MAX_NUMNODES;
        INIT_LIST_HEAD(&memcg->oom_notify);