memcg: debugging facility to access dangling memcgs

author Glauber Costa <glommer@parallels.com>

Thu, 9 May 2013 23:57:19 +0000 (09:57 +1000)

committer Stephen Rothwell <sfr@canb.auug.org.au>

Tue, 21 May 2013 04:17:43 +0000 (14:17 +1000)
author Glauber Costa <glommer@parallels.com>
Thu, 9 May 2013 23:57:19 +0000 (09:57 +1000)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Tue, 21 May 2013 04:17:43 +0000 (14:17 +1000)
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt

index ddf4f93967a94e1385d378ac257f5e919da77cfb..0683ae4d13ef7b82404cfcf1ee3c7df8e5dce9c4 100644 (file)
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -72,6 +72,7 @@ Brief summary of control files.
   memory.move_charge_at_immigrate # set/show controls of moving charges
   memory.oom_control             # set/show oom controls.
   memory.numa_stat               # show the number of memory usage per numa node
+ memory.dangling_memcgs          # show debugging information about dangling groups
  
   memory.kmem.limit_in_bytes      # set/show hard limit for kernel memory
   memory.kmem.usage_in_bytes      # show current kernel memory allocation
@@ -581,6 +582,21 @@ unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
  
  And we have total = file + anon + unevictable.
  
+5.7 dangling_memcgs
+
+This file is only ever present in the root cgroup, and only if the option
+CONFIG_MEMCG_DEBUG_ASYNC_DESTROY is set. When a memcg is destroyed, the memory
+consumed by it may not be immediately freed. This is because when some
+extensions are used, such as swap or kernel memory, objects can outlive the
+group and hold a reference to it.
+
+If this is the case, the dangling_memcgs file will show which memcgs are still
+alive, and which references are still preventing each of them from being
+freed. There is nothing wrong with that, but it is very useful when debugging,
+to know where this memory is being held. This is a developer-oriented debugging
+facility only, and no guarantees of interface stability will be given. The file
+is read-only, and has the sole purpose of displaying information.
+
  6. Hierarchy support
  
  The memory controller supports a deep hierarchy and hierarchical accounting.
diff --git a/init/Kconfig b/init/Kconfig

index a29c8cd60c42a81b786bb2787976575c9be20903..076a6342bc7677192fe3fd1af0e5c022606eb86d 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -939,6 +939,23 @@ config MEMCG_KMEM
           the kmem extension can use it to guarantee that no group of processes
           will ever exhaust kernel resources alone.
  
+config MEMCG_DEBUG_ASYNC_DESTROY
+       bool "Memory Resource Controller Debug asynchronous object destruction"
+       depends on MEMCG_KMEM || MEMCG_SWAP
+       default n
+       help
+         When a memcg is destroyed, the memory
+         consumed by it may not be immediately freed. This is because when some
+         extensions are used, such as swap or kernel memory, objects can
+         outlive the group and hold a reference to it.
+
+         If this is the case, the dangling_memcgs file will show which memcgs
+         are still alive, and which references are still preventing each of
+         them from being freed. There is nothing wrong with that, but it is
+         very useful when debugging, to know where this memory is being held.
+         This is a developer-oriented debugging facility only, and no
+         guarantees of interface stability will be given.
+
  config CGROUP_HUGETLB
         bool "HugeTLB Resource Controller for Control Groups"
         depends on RESOURCE_COUNTERS && HUGETLB_PAGE
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index caff46388129558e8396bfe282af761970b55612..e34da3c07a850c11a8ae451532ab3114dd7b769d 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -321,14 +321,31 @@ struct mem_cgroup {
         /* thresholds for mem+swap usage. RCU-protected */
         struct mem_cgroup_thresholds memsw_thresholds;
  
-       /* For oom notifier event fd */
-       struct list_head oom_notify;
+       union {
+               /* For oom notifier event fd */
+               struct list_head oom_notify;
+               /*
+                * we can only trigger an oom event if the memcg is alive.
+                * so we will reuse this field to hook the memcg in the list
+                * of dead memcgs.
+                */
+               struct list_head dead;
+       };
  
-       /*
-        * Should we move charges of a task when a task is moved into this
-        * mem_cgroup ? And what type of charges should we move ?
-        */
-       unsigned long   move_charge_at_immigrate;
+       union {
+               /*
+                * Should we move charges of a task when a task is moved into
+                * this mem_cgroup ? And what type of charges should we move ?
+                */
+               unsigned long move_charge_at_immigrate;
+
+               /*
+                * We are no longer concerned about moving charges after memcg
+                * is dead. So we will fill this up with its name, to aid
+                * debugging.
+                */
+               char *memcg_name;
+       };
         /*
          * set > 0 if pages under this cgroup are moving to other cgroup.
          */
@@ -382,6 +399,55 @@ static size_t memcg_size(void)
                 nr_node_ids * sizeof(struct mem_cgroup_per_node);
  }
  
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static LIST_HEAD(dangling_memcgs); /* memcgs linked by memcg_dangling_add() */
+static DEFINE_MUTEX(dangling_memcgs_mutex); /* guards dangling_memcgs */
+
+static inline void memcg_dangling_free(struct mem_cgroup *memcg)
+{
+       mutex_lock(&dangling_memcgs_mutex);
+       list_del(&memcg->dead); /* unhook from dangling_memcgs */
+       mutex_unlock(&dangling_memcgs_mutex);
+       free_pages((unsigned long)memcg->memcg_name, 0); /* may be NULL */
+}
+
+static inline void memcg_dangling_add(struct mem_cgroup *memcg)
+{
+       /*
+        * cgroup.c will do page-sized allocations most of the time,
+        * so we'll just follow the pattern. Also, __get_free_pages
+        * is a better interface than kmalloc for us here, because
+        * we'd like this memory to be always billed to the root cgroup,
+        * not to the process removing the memcg. While kmalloc would
+        * require us to wrap it into memcg_stop/resume_kmem_account,
+        * with __get_free_pages we just don't pass the memcg flag.
+        */
+       memcg->memcg_name = (char *)__get_free_pages(GFP_KERNEL, 0);
+
+       /*
+        * We will, in general, just ignore failures. No need to go crazy,
+        * as this is just a debugging interface. It is nice to copy the memcg
+        * name over, but if we (unlikely) can't, just the address will do.
+        */
+       if (!memcg->memcg_name)
+               goto add_list;
+
+       if (cgroup_path(memcg->css.cgroup, memcg->memcg_name, PAGE_SIZE) < 0) {
+               free_pages((unsigned long)memcg->memcg_name, 0);
+               memcg->memcg_name = NULL; /* reader prints the address */
+       }
+
+add_list:
+       INIT_LIST_HEAD(&memcg->dead);
+       mutex_lock(&dangling_memcgs_mutex);
+       list_add(&memcg->dead, &dangling_memcgs);
+       mutex_unlock(&dangling_memcgs_mutex);
+}
+#else
+static inline void memcg_dangling_free(struct mem_cgroup *memcg) {} /* no-op */
+static inline void memcg_dangling_add(struct mem_cgroup *memcg) {} /* no-op */
+#endif
+
  /* internal only representation about the status of kmem accounting. */
  enum {
         KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -5103,6 +5169,107 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
         return simple_read_from_buffer(buf, nbytes, ppos, str, len);
  }
  
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+static void
+mem_cgroup_dangling_swap(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_SWAP
+       u64 kmem_bytes;
+       u64 memsw_bytes;
+
+       /*
+        * The memsw counter also accounts kmem, so only the difference is
+        * reported.  See comment in mem_cgroup_reparent_charges for details.
+        *
+        * The kmem figure is read again by the kmem report rather than being
+        * saved here; a slight mismatch between the two is not a problem.
+        */
+       kmem_bytes = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+       memsw_bytes = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+       seq_printf(m, "\t%llu swap bytes\n", memsw_bytes - kmem_bytes);
+#endif
+}
+
+
+static void
+mem_cgroup_dangling_tcp(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
+       struct tcp_memcontrol *ctl = &memcg->tcp_mem;
+       s64 nr_socks;
+       u64 nr_bytes;
+
+       nr_socks = percpu_counter_sum_positive(&ctl->tcp_sockets_allocated);
+       nr_bytes = res_counter_read_u64(&ctl->tcp_memory_allocated, RES_USAGE);
+       seq_printf(m, "\t%llu tcp bytes", nr_bytes);
+       /*
+        * Sockets are reported whenever either figure is non-zero: sockets
+        * left over with no bytes charged would be a bug, so make it visible.
+        */
+       if (nr_socks || nr_bytes)
+               seq_printf(m, ", in %lld sockets", nr_socks);
+       seq_printf(m, "\n");
+
+#endif
+}
+
+static void
+mem_cgroup_dangling_kmem(struct mem_cgroup *memcg, struct seq_file *m)
+{
+#ifdef CONFIG_MEMCG_KMEM
+       u64 kmem;
+       struct memcg_cache_params *params;
+
+       kmem = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+       seq_printf(m, "\t%llu kmem bytes", kmem);
+
+       /* list below may not be initialized, so only walk it when kmem != 0 */
+       if (kmem) {
+               seq_printf(m, " in caches");
+               mutex_lock(&memcg->slab_caches_mutex);
+               list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+                       struct kmem_cache *s = memcg_params_to_cache(params);
+
+                       seq_printf(m, " %s", s->name);
+               }
+               mutex_unlock(&memcg->slab_caches_mutex);
+       }
+       /* always end the line, or the next memcg's header gets glued to it */
+       seq_printf(m, "\n");
+#endif
+}
+
+/*
+ * After a memcg is destroyed, it may still be kept around in memory.
+ * Currently, the two main reasons for it are swap entries, and kernel memory.
+ * Because they will be freed asynchronously, they will pin the memcg structure
+ * and its resources until the last reference goes away.
+ *
+ * This root-only file lists every such memcg and its swap, tcp and kmem use.
+ */
+static int mem_cgroup_dangling_read(struct cgroup *cont, struct cftype *cft,
+                                       struct seq_file *m)
+{
+       struct mem_cgroup *memcg;
+
+       mutex_lock(&dangling_memcgs_mutex); /* no add/del while we walk */
+
+       list_for_each_entry(memcg, &dangling_memcgs, dead) {
+               if (memcg->memcg_name)
+                       seq_printf(m, "%s:\n", memcg->memcg_name);
+               else
+                       seq_printf(m, "%p (name lost):\n", memcg);
+
+               mem_cgroup_dangling_swap(memcg, m);
+               mem_cgroup_dangling_tcp(memcg, m);
+               mem_cgroup_dangling_kmem(memcg, m);
+       }
+
+       mutex_unlock(&dangling_memcgs_mutex);
+       return 0;
+}
+#endif
+
  static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
  {
         int ret = -EINVAL;
@@ -6003,6 +6170,14 @@ static struct cftype mem_cgroup_files[] = {
                 .read_seq_string = mem_cgroup_slabinfo_read,
         },
  #endif
+#endif
+
+#ifdef CONFIG_MEMCG_DEBUG_ASYNC_DESTROY
+       {
+               .name = "dangling_memcgs",
+               .read_seq_string = mem_cgroup_dangling_read,
+               .flags = CFTYPE_ONLY_ON_ROOT,
+       },
  #endif
         { },    /* terminate */
  };
@@ -6153,6 +6328,8 @@ static void free_work(struct work_struct *work)
         struct mem_cgroup *memcg;
  
         memcg = container_of(work, struct mem_cgroup, work_freeing);
+
+       memcg_dangling_free(memcg);
         __mem_cgroup_free(memcg);
  }
  
@@ -6347,6 +6524,7 @@ static void mem_cgroup_css_free(struct cgroup *cont)
  
         kmem_cgroup_destroy(memcg);
  
+       memcg_dangling_add(memcg);
         mem_cgroup_put(memcg);
  }
author	Glauber Costa <glommer@parallels.com>
	Thu, 9 May 2013 23:57:19 +0000 (09:57 +1000)
committer	Stephen Rothwell <sfr@canb.auug.org.au>
	Tue, 21 May 2013 04:17:43 +0000 (14:17 +1000)
Documentation/cgroups/memory.txt		patch \| blob \| history
init/Kconfig		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history