mm/memcontrol.c

   1 /* memcontrol.c - Memory Controller
   2  *
   3  * Copyright IBM Corporation, 2007
   4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5  *
   6  * Copyright 2007 OpenVZ SWsoft Inc
   7  * Author: Pavel Emelianov <xemul@openvz.org>
   8  *
   9  * Memory thresholds
  10  * Copyright (C) 2009 Nokia Corporation
  11  * Author: Kirill A. Shutemov
  12  *
  13  * This program is free software; you can redistribute it and/or modify
  14  * it under the terms of the GNU General Public License as published by
  15  * the Free Software Foundation; either version 2 of the License, or
  16  * (at your option) any later version.
  17  *
  18  * This program is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU General Public License for more details.
  22  */
  23
  24 #include <linux/res_counter.h>
  25 #include <linux/memcontrol.h>
  26 #include <linux/cgroup.h>
  27 #include <linux/mm.h>
  28 #include <linux/hugetlb.h>
  29 #include <linux/pagemap.h>
  30 #include <linux/smp.h>
  31 #include <linux/page-flags.h>
  32 #include <linux/backing-dev.h>
  33 #include <linux/bit_spinlock.h>
  34 #include <linux/rcupdate.h>
  35 #include <linux/limits.h>
  36 #include <linux/export.h>
  37 #include <linux/mutex.h>
  38 #include <linux/rbtree.h>
  39 #include <linux/slab.h>
  40 #include <linux/swap.h>
  41 #include <linux/swapops.h>
  42 #include <linux/spinlock.h>
  43 #include <linux/eventfd.h>
  44 #include <linux/sort.h>
  45 #include <linux/fs.h>
  46 #include <linux/seq_file.h>
  47 #include <linux/vmalloc.h>
  48 #include <linux/mm_inline.h>
  49 #include <linux/page_cgroup.h>
  50 #include <linux/cpu.h>
  51 #include <linux/oom.h>
  52 #include "internal.h"
  53 #include <net/sock.h>
  54 #include <net/tcp_memcontrol.h>
  55
  56 #include <asm/uaccess.h>
  57
  58 #include <trace/events/vmscan.h>
  59
  60 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  61 #define MEM_CGROUP_RECLAIM_RETRIES      5
  62 struct mem_cgroup *root_mem_cgroup __read_mostly;
  63
  64 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  65 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
  66 int do_swap_account __read_mostly;
  67
  68 /* for remember boot option*/
  69 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
  70 static int really_do_swap_account __initdata = 1;
  71 #else
  72 static int really_do_swap_account __initdata = 0;
  73 #endif
  74
  75 #else
  76 #define do_swap_account         (0)
  77 #endif
  78
  79
  80 /*
  81  * Statistics for memory cgroup.
  82  */
  83 enum mem_cgroup_stat_index {
  84         /*
  85          * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  86          */
  87         MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
  88         MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
  89         MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
  90         MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
  91         MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
  92         MEM_CGROUP_ON_MOVE,     /* someone is moving account between groups */
  93         MEM_CGROUP_STAT_NSTATS,
  94 };
  95
  96 enum mem_cgroup_events_index {
  97         MEM_CGROUP_EVENTS_PGPGIN,       /* # of pages paged in */
  98         MEM_CGROUP_EVENTS_PGPGOUT,      /* # of pages paged out */
  99         MEM_CGROUP_EVENTS_COUNT,        /* # of pages paged in/out */
 100         MEM_CGROUP_EVENTS_PGFAULT,      /* # of page-faults */
 101         MEM_CGROUP_EVENTS_PGMAJFAULT,   /* # of major page-faults */
 102         MEM_CGROUP_EVENTS_NSTATS,
 103 };
 104 /*
 105  * Per memcg event counter is incremented at every pagein/pageout. With THP,
 106  * it will be incremated by the number of pages. This counter is used for
 107  * for trigger some periodic events. This is straightforward and better
 108  * than using jiffies etc. to handle periodic memcg event.
 109  */
 110 enum mem_cgroup_events_target {
 111         MEM_CGROUP_TARGET_THRESH,
 112         MEM_CGROUP_TARGET_SOFTLIMIT,
 113         MEM_CGROUP_TARGET_NUMAINFO,
 114         MEM_CGROUP_NTARGETS,
 115 };
 116 #define THRESHOLDS_EVENTS_TARGET (128)
 117 #define SOFTLIMIT_EVENTS_TARGET (1024)
 118 #define NUMAINFO_EVENTS_TARGET  (1024)
 119
 120 struct mem_cgroup_stat_cpu {
 121         long count[MEM_CGROUP_STAT_NSTATS];
 122         unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
 123         unsigned long targets[MEM_CGROUP_NTARGETS];
 124 };
 125
 126 struct mem_cgroup_reclaim_iter {
 127         /* css_id of the last scanned hierarchy member */
 128         int position;
 129         /* scan generation, increased every round-trip */
 130         unsigned int generation;
 131 };
 132
 133 /*
 134  * per-zone information in memory controller.
 135  */
 136 struct mem_cgroup_per_zone {
 137         struct lruvec           lruvec;
 138         unsigned long           count[NR_LRU_LISTS];
 139
 140         struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 141
 142         struct zone_reclaim_stat reclaim_stat;
 143         struct rb_node          tree_node;      /* RB tree node */
 144         unsigned long long      usage_in_excess;/* Set to the value by which */
 145                                                 /* the soft limit is exceeded*/
 146         bool                    on_tree;
 147         struct mem_cgroup       *mem;           /* Back pointer, we cannot */
 148                                                 /* use container_of        */
 149 };
 150 /* Macro for accessing counter */
 151 #define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
 152
 153 struct mem_cgroup_per_node {
 154         struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 155 };
 156
 157 struct mem_cgroup_lru_info {
 158         struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 159 };
 160
 161 /*
 162  * Cgroups above their limits are maintained in a RB-Tree, independent of
 163  * their hierarchy representation
 164  */
 165
 166 struct mem_cgroup_tree_per_zone {
 167         struct rb_root rb_root;
 168         spinlock_t lock;
 169 };
 170
 171 struct mem_cgroup_tree_per_node {
 172         struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
 173 };
 174
 175 struct mem_cgroup_tree {
 176         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 177 };
 178
 179 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 180
 181 struct mem_cgroup_threshold {
 182         struct eventfd_ctx *eventfd;
 183         u64 threshold;
 184 };
 185
 186 /* For threshold */
 187 struct mem_cgroup_threshold_ary {
 188         /* An array index points to threshold just below usage. */
 189         int current_threshold;
 190         /* Size of entries[] */
 191         unsigned int size;
 192         /* Array of thresholds */
 193         struct mem_cgroup_threshold entries[0];
 194 };
 195
 196 struct mem_cgroup_thresholds {
 197         /* Primary thresholds array */
 198         struct mem_cgroup_threshold_ary *primary;
 199         /*
 200          * Spare threshold array.
 201          * This is needed to make mem_cgroup_unregister_event() "never fail".
 202          * It must be able to store at least primary->size - 1 entries.
 203          */
 204         struct mem_cgroup_threshold_ary *spare;
 205 };
 206
 207 /* for OOM */
 208 struct mem_cgroup_eventfd_list {
 209         struct list_head list;
 210         struct eventfd_ctx *eventfd;
 211 };
 212
 213 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 214 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 215
 216 /*
 217  * The memory controller data structure. The memory controller controls both
 218  * page cache and RSS per cgroup. We would eventually like to provide
 219  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 220  * to help the administrator determine what knobs to tune.
 221  *
 222  * TODO: Add a water mark for the memory controller. Reclaim will begin when
 223  * we hit the water mark. May be even add a low water mark, such that
 224  * no reclaim occurs from a cgroup at it's low water mark, this is
 225  * a feature that will be implemented much later in the future.
 226  */
 227 struct mem_cgroup {
 228         struct cgroup_subsys_state css;
 229         /*
 230          * the counter to account for memory usage
 231          */
 232         struct res_counter res;
 233         /*
 234          * the counter to account for mem+swap usage.
 235          */
 236         struct res_counter memsw;
 237         /*
 238          * the counter to account for kmem usage.
 239          */
 240         struct res_counter kmem;
 241         /*
 242          * Per cgroup active and inactive list, similar to the
 243          * per zone LRU lists.
 244          */
 245         struct mem_cgroup_lru_info info;
 246         int last_scanned_node;
 247 #if MAX_NUMNODES > 1
 248         nodemask_t      scan_nodes;
 249         atomic_t        numainfo_events;
 250         atomic_t        numainfo_updating;
 251 #endif
 252         /*
 253          * Should the accounting and control be hierarchical, per subtree?
 254          */
 255         bool use_hierarchy;
 256
 257         bool            oom_lock;
 258         atomic_t        under_oom;
 259
 260         atomic_t        refcnt;
 261
 262         int     swappiness;
 263         /* OOM-Killer disable */
 264         int             oom_kill_disable;
 265
 266         /* set when res.limit == memsw.limit */
 267         bool            memsw_is_minimum;
 268
 269         /* protect arrays of thresholds */
 270         struct mutex thresholds_lock;
 271
 272         /* thresholds for memory usage. RCU-protected */
 273         struct mem_cgroup_thresholds thresholds;
 274
 275         /* thresholds for mem+swap usage. RCU-protected */
 276         struct mem_cgroup_thresholds memsw_thresholds;
 277
 278         /* For oom notifier event fd */
 279         struct list_head oom_notify;
 280
 281         /*
 282          * Should we move charges of a task when a task is moved into this
 283          * mem_cgroup ? And what type of charges should we move ?
 284          */
 285         unsigned long   move_charge_at_immigrate;
 286         /*
 287          * Should kernel memory limits be stabilished independently
 288          * from user memory ?
 289          */
 290         int             kmem_independent_accounting;
 291         /*
 292          * percpu counter.
 293          */
 294         struct mem_cgroup_stat_cpu *stat;
 295         /*
 296          * used when a cpu is offlined or other synchronizations
 297          * See mem_cgroup_read_stat().
 298          */
 299         struct mem_cgroup_stat_cpu nocpu_base;
 300         spinlock_t pcp_counter_lock;
 301
 302 #ifdef CONFIG_INET
 303         struct tcp_memcontrol tcp_mem;
 304 #endif
 305 };
 306
 307 /* Stuffs for move charges at task migration. */
 308 /*
 309  * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
 310  * left-shifted bitmap of these types.
 311  */
 312 enum move_type {
 313         MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
 314         MOVE_CHARGE_TYPE_FILE,  /* file page(including tmpfs) and swap of it */
 315         NR_MOVE_TYPE,
 316 };
 317
 318 /* "mc" and its members are protected by cgroup_mutex */
 319 static struct move_charge_struct {
 320         spinlock_t        lock; /* for from, to */
 321         struct mem_cgroup *from;
 322         struct mem_cgroup *to;
 323         unsigned long precharge;
 324         unsigned long moved_charge;
 325         unsigned long moved_swap;
 326         struct task_struct *moving_task;        /* a task moving charges */
 327         wait_queue_head_t waitq;                /* a waitq for other context */
 328 } mc = {
 329         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 330         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 331 };
 332
 333 static bool move_anon(void)
 334 {
 335         return test_bit(MOVE_CHARGE_TYPE_ANON,
 336                                         &mc.to->move_charge_at_immigrate);
 337 }
 338
 339 static bool move_file(void)
 340 {
 341         return test_bit(MOVE_CHARGE_TYPE_FILE,
 342                                         &mc.to->move_charge_at_immigrate);
 343 }
 344
 345 /*
 346  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 347  * limit reclaim to prevent infinite loops, if they ever occur.
 348  */
 349 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            (100)
 350 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
 351
 352 enum charge_type {
 353         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 354         MEM_CGROUP_CHARGE_TYPE_MAPPED,
 355         MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
 356         MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
 357         MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
 358         MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
 359         NR_CHARGE_TYPE,
 360 };
 361
 362 /* for encoding cft->private value on file */
 363
 364 enum mem_type {
 365         _MEM = 0,
 366         _MEMSWAP,
 367         _OOM_TYPE,
 368         _KMEM,
 369 };
 370
 371 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
 372 #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
 373 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
 374 /* Used for OOM nofiier */
 375 #define OOM_CONTROL             (0)
 376
 377 /*
 378  * Reclaim flags for mem_cgroup_hierarchical_reclaim
 379  */
 380 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT   0x0
 381 #define MEM_CGROUP_RECLAIM_NOSWAP       (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 382 #define MEM_CGROUP_RECLAIM_SHRINK_BIT   0x1
 383 #define MEM_CGROUP_RECLAIM_SHRINK       (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
 384
 385 static void mem_cgroup_get(struct mem_cgroup *memcg);
 386 static void mem_cgroup_put(struct mem_cgroup *memcg);
 387
 388 /* Writing them here to avoid exposing memcg's inner layout */
 389 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 390 #ifdef CONFIG_INET
 391 #include <net/sock.h>
 392 #include <net/ip.h>
 393
 394 static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
 395 void sock_update_memcg(struct sock *sk)
 396 {
 397         /* A socket spends its whole life in the same cgroup */
 398         if (sk->sk_cgrp) {
 399                 WARN_ON(1);
 400                 return;
 401         }
 402         if (static_branch(&memcg_socket_limit_enabled)) {
 403                 struct mem_cgroup *memcg;
 404
 405                 BUG_ON(!sk->sk_prot->proto_cgroup);
 406
 407                 rcu_read_lock();
 408                 memcg = mem_cgroup_from_task(current);
 409                 if (!mem_cgroup_is_root(memcg)) {
 410                         mem_cgroup_get(memcg);
 411                         sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
 412                 }
 413                 rcu_read_unlock();
 414         }
 415 }
 416 EXPORT_SYMBOL(sock_update_memcg);
 417
 418 void sock_release_memcg(struct sock *sk)
 419 {
 420         if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) {
 421                 struct mem_cgroup *memcg;
 422                 WARN_ON(!sk->sk_cgrp->memcg);
 423                 memcg = sk->sk_cgrp->memcg;
 424                 mem_cgroup_put(memcg);
 425         }
 426 }
 427
 428 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 429 {
 430         if (!memcg || mem_cgroup_is_root(memcg))
 431                 return NULL;
 432
 433         return &memcg->tcp_mem.cg_proto;
 434 }
 435 EXPORT_SYMBOL(tcp_proto_cgroup);
 436 #endif /* CONFIG_INET */
 437 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 438
 439 static void drain_all_stock_async(struct mem_cgroup *memcg);
 440
 441 static struct mem_cgroup_per_zone *
 442 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 443 {
 444         return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
 445 }
 446
 447 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 448 {
 449         return &memcg->css;
 450 }
 451
 452 static struct mem_cgroup_per_zone *
 453 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 454 {
 455         int nid = page_to_nid(page);
 456         int zid = page_zonenum(page);
 457
 458         return mem_cgroup_zoneinfo(memcg, nid, zid);
 459 }
 460
 461 static struct mem_cgroup_tree_per_zone *
 462 soft_limit_tree_node_zone(int nid, int zid)
 463 {
 464         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 465 }
 466
 467 static struct mem_cgroup_tree_per_zone *
 468 soft_limit_tree_from_page(struct page *page)
 469 {
 470         int nid = page_to_nid(page);
 471         int zid = page_zonenum(page);
 472
 473         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 474 }
 475
 476 static void
 477 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
 478                                 struct mem_cgroup_per_zone *mz,
 479                                 struct mem_cgroup_tree_per_zone *mctz,
 480                                 unsigned long long new_usage_in_excess)
 481 {
 482         struct rb_node **p = &mctz->rb_root.rb_node;
 483         struct rb_node *parent = NULL;
 484         struct mem_cgroup_per_zone *mz_node;
 485
 486         if (mz->on_tree)
 487                 return;
 488
 489         mz->usage_in_excess = new_usage_in_excess;
 490         if (!mz->usage_in_excess)
 491                 return;
 492         while (*p) {
 493                 parent = *p;
 494                 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
 495                                         tree_node);
 496                 if (mz->usage_in_excess < mz_node->usage_in_excess)
 497                         p = &(*p)->rb_left;
 498                 /*
 499                  * We can't avoid mem cgroups that are over their soft
 500                  * limit by the same amount
 501                  */
 502                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 503                         p = &(*p)->rb_right;
 504         }
 505         rb_link_node(&mz->tree_node, parent, p);
 506         rb_insert_color(&mz->tree_node, &mctz->rb_root);
 507         mz->on_tree = true;
 508 }
 509
 510 static void
 511 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 512                                 struct mem_cgroup_per_zone *mz,
 513                                 struct mem_cgroup_tree_per_zone *mctz)
 514 {
 515         if (!mz->on_tree)
 516                 return;
 517         rb_erase(&mz->tree_node, &mctz->rb_root);
 518         mz->on_tree = false;
 519 }
 520
 521 static void
 522 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 523                                 struct mem_cgroup_per_zone *mz,
 524                                 struct mem_cgroup_tree_per_zone *mctz)
 525 {
 526         spin_lock(&mctz->lock);
 527         __mem_cgroup_remove_exceeded(memcg, mz, mctz);
 528         spin_unlock(&mctz->lock);
 529 }
 530
 531
 532 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 533 {
 534         unsigned long long excess;
 535         struct mem_cgroup_per_zone *mz;
 536         struct mem_cgroup_tree_per_zone *mctz;
 537         int nid = page_to_nid(page);
 538         int zid = page_zonenum(page);
 539         mctz = soft_limit_tree_from_page(page);
 540
 541         /*
 542          * Necessary to update all ancestors when hierarchy is used.
 543          * because their event counter is not touched.
 544          */
 545         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 546                 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 547                 excess = res_counter_soft_limit_excess(&memcg->res);
 548                 /*
 549                  * We have to update the tree if mz is on RB-tree or
 550                  * mem is over its softlimit.
 551                  */
 552                 if (excess || mz->on_tree) {
 553                         spin_lock(&mctz->lock);
 554                         /* if on-tree, remove it */
 555                         if (mz->on_tree)
 556                                 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
 557                         /*
 558                          * Insert again. mz->usage_in_excess will be updated.
 559                          * If excess is 0, no tree ops.
 560                          */
 561                         __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
 562                         spin_unlock(&mctz->lock);
 563                 }
 564         }
 565 }
 566
 567 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 568 {
 569         int node, zone;
 570         struct mem_cgroup_per_zone *mz;
 571         struct mem_cgroup_tree_per_zone *mctz;
 572
 573         for_each_node_state(node, N_POSSIBLE) {
 574                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 575                         mz = mem_cgroup_zoneinfo(memcg, node, zone);
 576                         mctz = soft_limit_tree_node_zone(node, zone);
 577                         mem_cgroup_remove_exceeded(memcg, mz, mctz);
 578                 }
 579         }
 580 }
 581
 582 static struct mem_cgroup_per_zone *
 583 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 584 {
 585         struct rb_node *rightmost = NULL;
 586         struct mem_cgroup_per_zone *mz;
 587
 588 retry:
 589         mz = NULL;
 590         rightmost = rb_last(&mctz->rb_root);
 591         if (!rightmost)
 592                 goto done;              /* Nothing to reclaim from */
 593
 594         mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
 595         /*
 596          * Remove the node now but someone else can add it back,
 597          * we will to add it back at the end of reclaim to its correct
 598          * position in the tree.
 599          */
 600         __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
 601         if (!res_counter_soft_limit_excess(&mz->mem->res) ||
 602                 !css_tryget(&mz->mem->css))
 603                 goto retry;
 604 done:
 605         return mz;
 606 }
 607
 608 static struct mem_cgroup_per_zone *
 609 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 610 {
 611         struct mem_cgroup_per_zone *mz;
 612
 613         spin_lock(&mctz->lock);
 614         mz = __mem_cgroup_largest_soft_limit_node(mctz);
 615         spin_unlock(&mctz->lock);
 616         return mz;
 617 }
 618
 619 /*
 620  * Implementation Note: reading percpu statistics for memcg.
 621  *
 622  * Both of vmstat[] and percpu_counter has threshold and do periodic
 623  * synchronization to implement "quick" read. There are trade-off between
 624  * reading cost and precision of value. Then, we may have a chance to implement
 625  * a periodic synchronizion of counter in memcg's counter.
 626  *
 627  * But this _read() function is used for user interface now. The user accounts
 628  * memory usage by memory cgroup and he _always_ requires exact value because
 629  * he accounts memory. Even if we provide quick-and-fuzzy read, we always
 630  * have to visit all online cpus and make sum. So, for now, unnecessary
 631  * synchronization is not implemented. (just implemented for cpu hotplug)
 632  *
 633  * If there are kernel internal actions which can make use of some not-exact
 634  * value, and reading all cpu value can be performance bottleneck in some
 635  * common workload, threashold and synchonization as vmstat[] should be
 636  * implemented.
 637  */
 638 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 639                                  enum mem_cgroup_stat_index idx)
 640 {
 641         long val = 0;
 642         int cpu;
 643
 644         get_online_cpus();
 645         for_each_online_cpu(cpu)
 646                 val += per_cpu(memcg->stat->count[idx], cpu);
 647 #ifdef CONFIG_HOTPLUG_CPU
 648         spin_lock(&memcg->pcp_counter_lock);
 649         val += memcg->nocpu_base.count[idx];
 650         spin_unlock(&memcg->pcp_counter_lock);
 651 #endif
 652         put_online_cpus();
 653         return val;
 654 }
 655
 656 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 657                                          bool charge)
 658 {
 659         int val = (charge) ? 1 : -1;
 660         this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
 661 }
 662
 663 void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
 664 {
 665         this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
 666 }
 667
 668 void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
 669 {
 670         this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
 671 }
 672
 673 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 674                                             enum mem_cgroup_events_index idx)
 675 {
 676         unsigned long val = 0;
 677         int cpu;
 678
 679         for_each_online_cpu(cpu)
 680                 val += per_cpu(memcg->stat->events[idx], cpu);
 681 #ifdef CONFIG_HOTPLUG_CPU
 682         spin_lock(&memcg->pcp_counter_lock);
 683         val += memcg->nocpu_base.events[idx];
 684         spin_unlock(&memcg->pcp_counter_lock);
 685 #endif
 686         return val;
 687 }
 688
 689 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 690                                          bool file, int nr_pages)
 691 {
 692         preempt_disable();
 693
 694         if (file)
 695                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 696                                 nr_pages);
 697         else
 698                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 699                                 nr_pages);
 700
 701         /* pagein of a big page is an event. So, ignore page size */
 702         if (nr_pages > 0)
 703                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 704         else {
 705                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 706                 nr_pages = -nr_pages; /* for event */
 707         }
 708
 709         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
 710
 711         preempt_enable();
 712 }
 713
 714 unsigned long
 715 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 716                         unsigned int lru_mask)
 717 {
 718         struct mem_cgroup_per_zone *mz;
 719         enum lru_list l;
 720         unsigned long ret = 0;
 721
 722         mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 723
 724         for_each_lru(l) {
 725                 if (BIT(l) & lru_mask)
 726                         ret += MEM_CGROUP_ZSTAT(mz, l);
 727         }
 728         return ret;
 729 }
 730
 731 static unsigned long
 732 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 733                         int nid, unsigned int lru_mask)
 734 {
 735         u64 total = 0;
 736         int zid;
 737
 738         for (zid = 0; zid < MAX_NR_ZONES; zid++)
 739                 total += mem_cgroup_zone_nr_lru_pages(memcg,
 740                                                 nid, zid, lru_mask);
 741
 742         return total;
 743 }
 744
 745 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 746                         unsigned int lru_mask)
 747 {
 748         int nid;
 749         u64 total = 0;
 750
 751         for_each_node_state(nid, N_HIGH_MEMORY)
 752                 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 753         return total;
 754 }
 755
 756 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 757                                        enum mem_cgroup_events_target target)
 758 {
 759         unsigned long val, next;
 760
 761         val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
 762         next = __this_cpu_read(memcg->stat->targets[target]);
 763         /* from time_after() in jiffies.h */
 764         if ((long)next - (long)val < 0) {
 765                 switch (target) {
 766                 case MEM_CGROUP_TARGET_THRESH:
 767                         next = val + THRESHOLDS_EVENTS_TARGET;
 768                         break;
 769                 case MEM_CGROUP_TARGET_SOFTLIMIT:
 770                         next = val + SOFTLIMIT_EVENTS_TARGET;
 771                         break;
 772                 case MEM_CGROUP_TARGET_NUMAINFO:
 773                         next = val + NUMAINFO_EVENTS_TARGET;
 774                         break;
 775                 default:
 776                         break;
 777                 }
 778                 __this_cpu_write(memcg->stat->targets[target], next);
 779                 return true;
 780         }
 781         return false;
 782 }
 783
 784 /*
 785  * Check events in order.
 786  *
 787  */
 788 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 789 {
 790         preempt_disable();
 791         /* threshold event is triggered in finer grain than soft limit */
 792         if (unlikely(mem_cgroup_event_ratelimit(memcg,
 793                                                 MEM_CGROUP_TARGET_THRESH))) {
 794                 bool do_softlimit, do_numainfo;
 795
 796                 do_softlimit = mem_cgroup_event_ratelimit(memcg,
 797                                                 MEM_CGROUP_TARGET_SOFTLIMIT);
 798 #if MAX_NUMNODES > 1
 799                 do_numainfo = mem_cgroup_event_ratelimit(memcg,
 800                                                 MEM_CGROUP_TARGET_NUMAINFO);
 801 #endif
 802                 preempt_enable();
 803
 804                 mem_cgroup_threshold(memcg);
 805                 if (unlikely(do_softlimit))
 806                         mem_cgroup_update_tree(memcg, page);
 807 #if MAX_NUMNODES > 1
 808                 if (unlikely(do_numainfo))
 809                         atomic_inc(&memcg->numainfo_events);
 810 #endif
 811         } else
 812                 preempt_enable();
 813 }
 814
 815 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 816 {
 817         return container_of(cgroup_subsys_state(cont,
 818                                 mem_cgroup_subsys_id), struct mem_cgroup,
 819                                 css);
 820 }
 821
 822 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 823 {
 824         /*
 825          * mm_update_next_owner() may clear mm->owner to NULL
 826          * if it races with swapoff, page migration, etc.
 827          * So this can be called with p == NULL.
 828          */
 829         if (unlikely(!p))
 830                 return NULL;
 831
 832         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 833                                 struct mem_cgroup, css);
 834 }
 835
 836 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 837 {
 838         struct mem_cgroup *memcg = NULL;
 839
 840         if (!mm)
 841                 return NULL;
 842         /*
 843          * Because we have no locks, mm->owner's may be being moved to other
 844          * cgroup. We use css_tryget() here even if this looks
 845          * pessimistic (rather than adding locks here).
 846          */
 847         rcu_read_lock();
 848         do {
 849                 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 850                 if (unlikely(!memcg))
 851                         break;
 852         } while (!css_tryget(&memcg->css));
 853         rcu_read_unlock();
 854         return memcg;
 855 }
 856
 857 /**
 858  * mem_cgroup_iter - iterate over memory cgroup hierarchy
 859  * @root: hierarchy root
 860  * @prev: previously returned memcg, NULL on first invocation
 861  * @reclaim: cookie for shared reclaim walks, NULL for full walks
 862  *
 863  * Returns references to children of the hierarchy below @root, or
 864  * @root itself, or %NULL after a full round-trip.
 865  *
 866  * Caller must pass the return value in @prev on subsequent
 867  * invocations for reference counting, or use mem_cgroup_iter_break()
 868  * to cancel a hierarchy walk before the round-trip is complete.
 869  *
 870  * Reclaimers can specify a zone and a priority level in @reclaim to
 871  * divide up the memcgs in the hierarchy among all concurrent
 872  * reclaimers operating on the same zone and priority.
 873  */
 874 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 875                                    struct mem_cgroup *prev,
 876                                    struct mem_cgroup_reclaim_cookie *reclaim)
 877 {
 878         struct mem_cgroup *memcg = NULL;
 879         int id = 0;
 880
 881         if (mem_cgroup_disabled())
 882                 return NULL;
 883
 884         if (!root)
 885                 root = root_mem_cgroup;
 886
 887         if (prev && !reclaim)
 888                 id = css_id(&prev->css);
 889
 890         if (prev && prev != root)
 891                 css_put(&prev->css);
 892
 893         if (!root->use_hierarchy && root != root_mem_cgroup) {
 894                 if (prev)
 895                         return NULL;
 896                 return root;
 897         }
 898
 899         while (!memcg) {
 900                 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 901                 struct cgroup_subsys_state *css;
 902
 903                 if (reclaim) {
 904                         int nid = zone_to_nid(reclaim->zone);
 905                         int zid = zone_idx(reclaim->zone);
 906                         struct mem_cgroup_per_zone *mz;
 907
 908                         mz = mem_cgroup_zoneinfo(root, nid, zid);
 909                         iter = &mz->reclaim_iter[reclaim->priority];
 910                         if (prev && reclaim->generation != iter->generation)
 911                                 return NULL;
 912                         id = iter->position;
 913                 }
 914
 915                 rcu_read_lock();
 916                 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
 917                 if (css) {
 918                         if (css == &root->css || css_tryget(css))
 919                                 memcg = container_of(css,
 920                                                      struct mem_cgroup, css);
 921                 } else
 922                         id = 0;
 923                 rcu_read_unlock();
 924
 925                 if (reclaim) {
 926                         iter->position = id;
 927                         if (!css)
 928                                 iter->generation++;
 929                         else if (!prev && memcg)
 930                                 reclaim->generation = iter->generation;
 931                 }
 932
 933                 if (prev && !css)
 934                         return NULL;
 935         }
 936         return memcg;
 937 }
 938
 939 /**
 940  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 941  * @root: hierarchy root
 942  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 943  */
 944 void mem_cgroup_iter_break(struct mem_cgroup *root,
 945                            struct mem_cgroup *prev)
 946 {
 947         if (!root)
 948                 root = root_mem_cgroup;
 949         if (prev && prev != root)
 950                 css_put(&prev->css);
 951 }
 952
 953 /*
 954  * Iteration constructs for visiting all cgroups (under a tree).  If
 955  * loops are exited prematurely (break), mem_cgroup_iter_break() must
 956  * be used for reference counting.
 957  */
 958 #define for_each_mem_cgroup_tree(iter, root)            \
 959         for (iter = mem_cgroup_iter(root, NULL, NULL);  \
 960              iter != NULL;                              \
 961              iter = mem_cgroup_iter(root, iter, NULL))
 962
 963 #define for_each_mem_cgroup(iter)                       \
 964         for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
 965              iter != NULL;                              \
 966              iter = mem_cgroup_iter(NULL, iter, NULL))
 967
 968 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 969 {
 970         return (memcg == root_mem_cgroup);
 971 }
 972
 973 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 974 {
 975         struct mem_cgroup *memcg;
 976
 977         if (!mm)
 978                 return;
 979
 980         rcu_read_lock();
 981         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 982         if (unlikely(!memcg))
 983                 goto out;
 984
 985         switch (idx) {
 986         case PGMAJFAULT:
 987                 mem_cgroup_pgmajfault(memcg, 1);
 988                 break;
 989         case PGFAULT:
 990                 mem_cgroup_pgfault(memcg, 1);
 991                 break;
 992         default:
 993                 BUG();
 994         }
 995 out:
 996         rcu_read_unlock();
 997 }
 998 EXPORT_SYMBOL(mem_cgroup_count_vm_event);
 999
1000 /**
1001  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1002  * @zone: zone of the wanted lruvec
1003  * @mem: memcg of the wanted lruvec
1004  *
1005  * Returns the lru list vector holding pages for the given @zone and
1006  * @mem.  This can be the global zone lruvec, if the memory controller
1007  * is disabled.
1008  */
1009 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1010                                       struct mem_cgroup *memcg)
1011 {
1012         struct mem_cgroup_per_zone *mz;
1013
1014         if (mem_cgroup_disabled())
1015                 return &zone->lruvec;
1016
1017         mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1018         return &mz->lruvec;
1019 }
1020
/*
 * The following LRU functions may be used without PCG_LOCK.
 * They are called by the global LRU routines, independently of memcg.
 * What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen on
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception is
 * SwapCache, which is added to the LRU before it is charged.
 * If the PCG_USED bit is not set, page_cgroup is not added to this private LRU.
 * When moving account, the page is not on the LRU. It's isolated.
 */
1034
1035 /**
1036  * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
1037  * @zone: zone of the page
1038  * @page: the page
1039  * @lru: current lru
1040  *
1041  * This function accounts for @page being added to @lru, and returns
1042  * the lruvec for the given @zone and the memcg @page is charged to.
1043  *
1044  * The callsite is then responsible for physically linking the page to
1045  * the returned lruvec->lists[@lru].
1046  */
1047 struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1048                                        enum lru_list lru)
1049 {
1050         struct mem_cgroup_per_zone *mz;
1051         struct mem_cgroup *memcg;
1052         struct page_cgroup *pc;
1053
1054         if (mem_cgroup_disabled())
1055                 return &zone->lruvec;
1056
1057         pc = lookup_page_cgroup(page);
1058         VM_BUG_ON(PageCgroupAcctLRU(pc));
1059         /*
1060          * putback:                             charge:
1061          * SetPageLRU                           SetPageCgroupUsed
1062          * smp_mb                               smp_mb
1063          * PageCgroupUsed && add to memcg LRU   PageLRU && add to memcg LRU
1064          *
1065          * Ensure that one of the two sides adds the page to the memcg
1066          * LRU during a race.
1067          */
1068         smp_mb();
1069         /*
1070          * If the page is uncharged, it may be freed soon, but it
1071          * could also be swap cache (readahead, swapoff) that needs to
1072          * be reclaimable in the future.  root_mem_cgroup will babysit
1073          * it for the time being.
1074          */
1075         if (PageCgroupUsed(pc)) {
1076                 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1077                 smp_rmb();
1078                 memcg = pc->mem_cgroup;
1079                 SetPageCgroupAcctLRU(pc);
1080         } else
1081                 memcg = root_mem_cgroup;
1082         mz = page_cgroup_zoneinfo(memcg, page);
1083         /* compound_order() is stabilized through lru_lock */
1084         MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1085         return &mz->lruvec;
1086 }
1087
1088 /**
1089  * mem_cgroup_lru_del_list - account for removing an lru page
1090  * @page: the page
1091  * @lru: target lru
1092  *
1093  * This function accounts for @page being removed from @lru.
1094  *
1095  * The callsite is then responsible for physically unlinking
1096  * @page->lru.
1097  */
1098 void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1099 {
1100         struct mem_cgroup_per_zone *mz;
1101         struct mem_cgroup *memcg;
1102         struct page_cgroup *pc;
1103
1104         if (mem_cgroup_disabled())
1105                 return;
1106
1107         pc = lookup_page_cgroup(page);
1108         /*
1109          * root_mem_cgroup babysits uncharged LRU pages, but
1110          * PageCgroupUsed is cleared when the page is about to get
1111          * freed.  PageCgroupAcctLRU remembers whether the
1112          * LRU-accounting happened against pc->mem_cgroup or
1113          * root_mem_cgroup.
1114          */
1115         if (TestClearPageCgroupAcctLRU(pc)) {
1116                 VM_BUG_ON(!pc->mem_cgroup);
1117                 memcg = pc->mem_cgroup;
1118         } else
1119                 memcg = root_mem_cgroup;
1120         mz = page_cgroup_zoneinfo(memcg, page);
1121         /* huge page split is done under lru_lock. so, we have no races. */
1122         MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
1123 }
1124
1125 void mem_cgroup_lru_del(struct page *page)
1126 {
1127         mem_cgroup_lru_del_list(page, page_lru(page));
1128 }
1129
1130 /**
1131  * mem_cgroup_lru_move_lists - account for moving a page between lrus
1132  * @zone: zone of the page
1133  * @page: the page
1134  * @from: current lru
1135  * @to: target lru
1136  *
1137  * This function accounts for @page being moved between the lrus @from
1138  * and @to, and returns the lruvec for the given @zone and the memcg
1139  * @page is charged to.
1140  *
1141  * The callsite is then responsible for physically relinking
1142  * @page->lru to the returned lruvec->lists[@to].
1143  */
1144 struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1145                                          struct page *page,
1146                                          enum lru_list from,
1147                                          enum lru_list to)
1148 {
1149         /* XXX: Optimize this, especially for @from == @to */
1150         mem_cgroup_lru_del_list(page, from);
1151         return mem_cgroup_lru_add_list(zone, page, to);
1152 }
1153
1154 /*
1155  * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
1156  * while it's linked to lru because the page may be reused after it's fully
1157  * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
1158  * It's done under lock_page and expected that zone->lru_lock isnever held.
1159  */
1160 static void mem_cgroup_lru_del_before_commit(struct page *page)
1161 {
1162         enum lru_list lru;
1163         unsigned long flags;
1164         struct zone *zone = page_zone(page);
1165         struct page_cgroup *pc = lookup_page_cgroup(page);
1166
1167         /*
1168          * Doing this check without taking ->lru_lock seems wrong but this
1169          * is safe. Because if page_cgroup's USED bit is unset, the page
1170          * will not be added to any memcg's LRU. If page_cgroup's USED bit is
1171          * set, the commit after this will fail, anyway.
1172          * This all charge/uncharge is done under some mutual execustion.
1173          * So, we don't need to taking care of changes in USED bit.
1174          */
1175         if (likely(!PageLRU(page)))
1176                 return;
1177
1178         spin_lock_irqsave(&zone->lru_lock, flags);
1179         lru = page_lru(page);
1180         /*
1181          * The uncharged page could still be registered to the LRU of
1182          * the stale pc->mem_cgroup.
1183          *
1184          * As pc->mem_cgroup is about to get overwritten, the old LRU
1185          * accounting needs to be taken care of.  Let root_mem_cgroup
1186          * babysit the page until the new memcg is responsible for it.
1187          *
1188          * The PCG_USED bit is guarded by lock_page() as the page is
1189          * swapcache/pagecache.
1190          */
1191         if (PageLRU(page) && PageCgroupAcctLRU(pc) && !PageCgroupUsed(pc)) {
1192                 del_page_from_lru_list(zone, page, lru);
1193                 add_page_to_lru_list(zone, page, lru);
1194         }
1195         spin_unlock_irqrestore(&zone->lru_lock, flags);
1196 }
1197
1198 static void mem_cgroup_lru_add_after_commit(struct page *page)
1199 {
1200         enum lru_list lru;
1201         unsigned long flags;
1202         struct zone *zone = page_zone(page);
1203         struct page_cgroup *pc = lookup_page_cgroup(page);
1204         /*
1205          * putback:                             charge:
1206          * SetPageLRU                           SetPageCgroupUsed
1207          * smp_mb                               smp_mb
1208          * PageCgroupUsed && add to memcg LRU   PageLRU && add to memcg LRU
1209          *
1210          * Ensure that one of the two sides adds the page to the memcg
1211          * LRU during a race.
1212          */
1213         smp_mb();
1214         /* taking care of that the page is added to LRU while we commit it */
1215         if (likely(!PageLRU(page)))
1216                 return;
1217         spin_lock_irqsave(&zone->lru_lock, flags);
1218         lru = page_lru(page);
1219         /*
1220          * If the page is not on the LRU, someone will soon put it
1221          * there.  If it is, and also already accounted for on the
1222          * memcg-side, it must be on the right lruvec as setting
1223          * pc->mem_cgroup and PageCgroupUsed is properly ordered.
1224          * Otherwise, root_mem_cgroup has been babysitting the page
1225          * during the charge.  Move it to the new memcg now.
1226          */
1227         if (PageLRU(page) && !PageCgroupAcctLRU(pc)) {
1228                 del_page_from_lru_list(zone, page, lru);
1229                 add_page_to_lru_list(zone, page, lru);
1230         }
1231         spin_unlock_irqrestore(&zone->lru_lock, flags);
1232 }
1233
1234 /*
1235  * Checks whether given mem is same or in the root_mem_cgroup's
1236  * hierarchy subtree
1237  */
1238 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1239                 struct mem_cgroup *memcg)
1240 {
1241         if (root_memcg != memcg) {
1242                 return (root_memcg->use_hierarchy &&
1243                         css_is_ancestor(&memcg->css, &root_memcg->css));
1244         }
1245
1246         return true;
1247 }
1248
1249 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1250 {
1251         int ret;
1252         struct mem_cgroup *curr = NULL;
1253         struct task_struct *p;
1254
1255         p = find_lock_task_mm(task);
1256         if (!p)
1257                 return 0;
1258         curr = try_get_mem_cgroup_from_mm(p->mm);
1259         task_unlock(p);
1260         if (!curr)
1261                 return 0;
1262         /*
1263          * We should check use_hierarchy of "memcg" not "curr". Because checking
1264          * use_hierarchy of "curr" here make this function true if hierarchy is
1265          * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
1266          * hierarchy(even if use_hierarchy is disabled in "memcg").
1267          */
1268         ret = mem_cgroup_same_or_subtree(memcg, curr);
1269         css_put(&curr->css);
1270         return ret;
1271 }
1272
1273 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1274 {
1275         unsigned long inactive_ratio;
1276         int nid = zone_to_nid(zone);
1277         int zid = zone_idx(zone);
1278         unsigned long inactive;
1279         unsigned long active;
1280         unsigned long gb;
1281
1282         inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1283                                                 BIT(LRU_INACTIVE_ANON));
1284         active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1285                                               BIT(LRU_ACTIVE_ANON));
1286
1287         gb = (inactive + active) >> (30 - PAGE_SHIFT);
1288         if (gb)
1289                 inactive_ratio = int_sqrt(10 * gb);
1290         else
1291                 inactive_ratio = 1;
1292
1293         return inactive * inactive_ratio < active;
1294 }
1295
1296 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
1297 {
1298         unsigned long active;
1299         unsigned long inactive;
1300         int zid = zone_idx(zone);
1301         int nid = zone_to_nid(zone);
1302
1303         inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1304                                                 BIT(LRU_INACTIVE_FILE));
1305         active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1306                                               BIT(LRU_ACTIVE_FILE));
1307
1308         return (active > inactive);
1309 }
1310
1311 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1312                                                       struct zone *zone)
1313 {
1314         int nid = zone_to_nid(zone);
1315         int zid = zone_idx(zone);
1316         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1317
1318         return &mz->reclaim_stat;
1319 }
1320
1321 struct zone_reclaim_stat *
1322 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1323 {
1324         struct page_cgroup *pc;
1325         struct mem_cgroup_per_zone *mz;
1326
1327         if (mem_cgroup_disabled())
1328                 return NULL;
1329
1330         pc = lookup_page_cgroup(page);
1331         if (!PageCgroupUsed(pc))
1332                 return NULL;
1333         /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1334         smp_rmb();
1335         mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1336         return &mz->reclaim_stat;
1337 }
1338
/* Resolve @counter to the mem_cgroup that embeds it as field @member. */
#define mem_cgroup_from_res_counter(counter, member)		\
	container_of((counter), struct mem_cgroup, member)
1341
1342 /**
1343  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1344  * @mem: the memory cgroup
1345  *
1346  * Returns the maximum amount of memory @mem can be charged with, in
1347  * pages.
1348  */
1349 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1350 {
1351         unsigned long long margin;
1352
1353         margin = res_counter_margin(&memcg->res);
1354         if (do_swap_account)
1355                 margin = min(margin, res_counter_margin(&memcg->memsw));
1356         return margin >> PAGE_SHIFT;
1357 }
1358
1359 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1360 {
1361         struct cgroup *cgrp = memcg->css.cgroup;
1362
1363         /* root ? */
1364         if (cgrp->parent == NULL)
1365                 return vm_swappiness;
1366
1367         return memcg->swappiness;
1368 }
1369
1370 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1371 {
1372         int cpu;
1373
1374         get_online_cpus();
1375         spin_lock(&memcg->pcp_counter_lock);
1376         for_each_online_cpu(cpu)
1377                 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1378         memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1379         spin_unlock(&memcg->pcp_counter_lock);
1380         put_online_cpus();
1381
1382         synchronize_rcu();
1383 }
1384
1385 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1386 {
1387         int cpu;
1388
1389         if (!memcg)
1390                 return;
1391         get_online_cpus();
1392         spin_lock(&memcg->pcp_counter_lock);
1393         for_each_online_cpu(cpu)
1394                 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1395         memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1396         spin_unlock(&memcg->pcp_counter_lock);
1397         put_online_cpus();
1398 }
/*
 * Two routines for checking whether "mem" is under move_account() or not.
 *
 * mem_cgroup_stealed() - checks whether a cgroup is mc.from or not. This is
 *                        used for avoiding races in accounting. If true,
 *                        pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to, or
 *                        under the hierarchy of the moving cgroups. This is
 *                        for waiting at high memory pressure caused by "move".
 */
1410
1411 static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
1412 {
1413         VM_BUG_ON(!rcu_read_lock_held());
1414         return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1415 }
1416
1417 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1418 {
1419         struct mem_cgroup *from;
1420         struct mem_cgroup *to;
1421         bool ret = false;
1422         /*
1423          * Unlike task_move routines, we access mc.to, mc.from not under
1424          * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1425          */
1426         spin_lock(&mc.lock);
1427         from = mc.from;
1428         to = mc.to;
1429         if (!from)
1430                 goto unlock;
1431
1432         ret = mem_cgroup_same_or_subtree(memcg, from)
1433                 || mem_cgroup_same_or_subtree(memcg, to);
1434 unlock:
1435         spin_unlock(&mc.lock);
1436         return ret;
1437 }
1438
1439 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1440 {
1441         if (mc.moving_task && current != mc.moving_task) {
1442                 if (mem_cgroup_under_move(memcg)) {
1443                         DEFINE_WAIT(wait);
1444                         prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1445                         /* moving charge context might have finished. */
1446                         if (mc.moving_task)
1447                                 schedule();
1448                         finish_wait(&mc.waitq, &wait);
1449                         return true;
1450                 }
1451         }
1452         return false;
1453 }
1454
1455 /**
1456  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1457  * @memcg: The memory cgroup that went over limit
1458  * @p: Task that is going to be killed
1459  *
1460  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1461  * enabled
1462  */
1463 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1464 {
1465         struct cgroup *task_cgrp;
1466         struct cgroup *mem_cgrp;
1467         /*
1468          * Need a buffer in BSS, can't rely on allocations. The code relies
1469          * on the assumption that OOM is serialized for memory controller.
1470          * If this assumption is broken, revisit this code.
1471          */
1472         static char memcg_name[PATH_MAX];
1473         int ret;
1474
1475         if (!memcg || !p)
1476                 return;
1477
1478
1479         rcu_read_lock();
1480
1481         mem_cgrp = memcg->css.cgroup;
1482         task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1483
1484         ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1485         if (ret < 0) {
1486                 /*
1487                  * Unfortunately, we are unable to convert to a useful name
1488                  * But we'll still print out the usage information
1489                  */
1490                 rcu_read_unlock();
1491                 goto done;
1492         }
1493         rcu_read_unlock();
1494
1495         printk(KERN_INFO "Task in %s killed", memcg_name);
1496
1497         rcu_read_lock();
1498         ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1499         if (ret < 0) {
1500                 rcu_read_unlock();
1501                 goto done;
1502         }
1503         rcu_read_unlock();
1504
1505         /*
1506          * Continues from above, so we don't need an KERN_ level
1507          */
1508         printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1509 done:
1510
1511         printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1512                 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1513                 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1514                 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1515         printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1516                 "failcnt %llu\n",
1517                 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1518                 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1519                 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1520 }
1521
1522 /*
1523  * This function returns the number of memcg under hierarchy tree. Returns
1524  * 1(self count) if no children.
1525  */
1526 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1527 {
1528         int num = 0;
1529         struct mem_cgroup *iter;
1530
1531         for_each_mem_cgroup_tree(iter, memcg)
1532                 num++;
1533         return num;
1534 }
1535
1536 /*
1537  * Return the memory (and swap, if configured) limit for a memcg.
1538  */
1539 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1540 {
1541         u64 limit;
1542         u64 memsw;
1543
1544         limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1545         limit += total_swap_pages << PAGE_SHIFT;
1546
1547         memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1548         /*
1549          * If memsw is finite and limits the amount of swap space available
1550          * to this memcg, return that limit.
1551          */
1552         return min(limit, memsw);
1553 }
1554
1555 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1556                                         gfp_t gfp_mask,
1557                                         unsigned long flags)
1558 {
1559         unsigned long total = 0;
1560         bool noswap = false;
1561         int loop;
1562
1563         if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1564                 noswap = true;
1565         if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1566                 noswap = true;
1567
1568         for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1569                 if (loop)
1570                         drain_all_stock_async(memcg);
1571                 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1572                 /*
1573                  * Allow limit shrinkers, which are triggered directly
1574                  * by userspace, to catch signals and stop reclaim
1575                  * after minimal progress, regardless of the margin.
1576                  */
1577                 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1578                         break;
1579                 if (mem_cgroup_margin(memcg))
1580                         break;
1581                 /*
1582                  * If nothing was reclaimed after two attempts, there
1583                  * may be no reclaimable pages in this hierarchy.
1584                  */
1585                 if (loop && !total)
1586                         break;
1587         }
1588         return total;
1589 }
1590
1591 /**
1592  * test_mem_cgroup_node_reclaimable
1593  * @mem: the target memcg
1594  * @nid: the node ID to be checked.
1595  * @noswap : specify true here if the user wants flle only information.
1596  *
1597  * This function returns whether the specified memcg contains any
1598  * reclaimable pages on a node. Returns true if there are any reclaimable
1599  * pages in the node.
1600  */
1601 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1602                 int nid, bool noswap)
1603 {
1604         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1605                 return true;
1606         if (noswap || !total_swap_pages)
1607                 return false;
1608         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1609                 return true;
1610         return false;
1611
1612 }
1613 #if MAX_NUMNODES > 1
1614
1615 /*
1616  * Always updating the nodemask is not very good - even if we have an empty
1617  * list or the wrong list here, we can start from some node and traverse all
1618  * nodes based on the zonelist. So update the list loosely once per 10 secs.
1619  *
1620  */
1621 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1622 {
1623         int nid;
1624         /*
1625          * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1626          * pagein/pageout changes since the last update.
1627          */
1628         if (!atomic_read(&memcg->numainfo_events))
1629                 return;
1630         if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1631                 return;
1632
1633         /* make a nodemask where this memcg uses memory from */
1634         memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1635
1636         for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1637
1638                 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1639                         node_clear(nid, memcg->scan_nodes);
1640         }
1641
1642         atomic_set(&memcg->numainfo_events, 0);
1643         atomic_set(&memcg->numainfo_updating, 0);
1644 }
1645
1646 /*
1647  * Selecting a node where we start reclaim from. Because what we need is just
1648  * reducing usage counter, start from anywhere is O,K. Considering
1649  * memory reclaim from current node, there are pros. and cons.
1650  *
1651  * Freeing memory from current node means freeing memory from a node which
1652  * we'll use or we've used. So, it may make LRU bad. And if several threads
1653  * hit limits, it will see a contention on a node. But freeing from remote
1654  * node means more costs for memory reclaim because of memory latency.
1655  *
1656  * Now, we use round-robin. Better algorithm is welcomed.
1657  */
1658 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1659 {
1660         int node;
1661
1662         mem_cgroup_may_update_nodemask(memcg);
1663         node = memcg->last_scanned_node;
1664
1665         node = next_node(node, memcg->scan_nodes);
1666         if (node == MAX_NUMNODES)
1667                 node = first_node(memcg->scan_nodes);
1668         /*
1669          * We call this when we hit limit, not when pages are added to LRU.
1670          * No LRU may hold pages because all pages are UNEVICTABLE or
1671          * memcg is too small and all pages are not on LRU. In that case,
1672          * we use curret node.
1673          */
1674         if (unlikely(node == MAX_NUMNODES))
1675                 node = numa_node_id();
1676
1677         memcg->last_scanned_node = node;
1678         return node;
1679 }
1680
1681 /*
1682  * Check all nodes whether it contains reclaimable pages or not.
1683  * For quick scan, we make use of scan_nodes. This will allow us to skip
1684  * unused nodes. But scan_nodes is lazily updated and may not cotain
1685  * enough new information. We need to do double check.
1686  */
1687 bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1688 {
1689         int nid;
1690
1691         /*
1692          * quick check...making use of scan_node.
1693          * We can skip unused nodes.
1694          */
1695         if (!nodes_empty(memcg->scan_nodes)) {
1696                 for (nid = first_node(memcg->scan_nodes);
1697                      nid < MAX_NUMNODES;
1698                      nid = next_node(nid, memcg->scan_nodes)) {
1699
1700                         if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1701                                 return true;
1702                 }
1703         }
1704         /*
1705          * Check rest of nodes.
1706          */
1707         for_each_node_state(nid, N_HIGH_MEMORY) {
1708                 if (node_isset(nid, memcg->scan_nodes))
1709                         continue;
1710                 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1711                         return true;
1712         }
1713         return false;
1714 }
1715
1716 #else
/*
 * Variant for the #else side of the enclosing preprocessor conditional:
 * no node selection is performed and node 0 is always returned.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
        return 0;
}
1721
/*
 * Variant for the #else side of the enclosing preprocessor conditional:
 * only node 0 is tested for reclaimable pages.
 */
bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
        return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}
1726 #endif
1727
1728 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1729                                    struct zone *zone,
1730                                    gfp_t gfp_mask,
1731                                    unsigned long *total_scanned)
1732 {
1733         struct mem_cgroup *victim = NULL;
1734         int total = 0;
1735         int loop = 0;
1736         unsigned long excess;
1737         unsigned long nr_scanned;
1738         struct mem_cgroup_reclaim_cookie reclaim = {
1739                 .zone = zone,
1740                 .priority = 0,
1741         };
1742
1743         excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1744
1745         while (1) {
1746                 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1747                 if (!victim) {
1748                         loop++;
1749                         if (loop >= 2) {
1750                                 /*
1751                                  * If we have not been able to reclaim
1752                                  * anything, it might because there are
1753                                  * no reclaimable pages under this hierarchy
1754                                  */
1755                                 if (!total)
1756                                         break;
1757                                 /*
1758                                  * We want to do more targeted reclaim.
1759                                  * excess >> 2 is not to excessive so as to
1760                                  * reclaim too much, nor too less that we keep
1761                                  * coming back to reclaim from this cgroup
1762                                  */
1763                                 if (total >= (excess >> 2) ||
1764                                         (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1765                                         break;
1766                         }
1767                         continue;
1768                 }
1769                 if (!mem_cgroup_reclaimable(victim, false))
1770                         continue;
1771                 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1772                                                      zone, &nr_scanned);
1773                 *total_scanned += nr_scanned;
1774                 if (!res_counter_soft_limit_excess(&root_memcg->res))
1775                         break;
1776         }
1777         mem_cgroup_iter_break(root_memcg, victim);
1778         return total;
1779 }
1780
1781 /*
1782  * Check OOM-Killer is already running under our hierarchy.
1783  * If someone is running, return false.
1784  * Has to be called with memcg_oom_lock
1785  */
1786 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1787 {
1788         struct mem_cgroup *iter, *failed = NULL;
1789
1790         for_each_mem_cgroup_tree(iter, memcg) {
1791                 if (iter->oom_lock) {
1792                         /*
1793                          * this subtree of our hierarchy is already locked
1794                          * so we cannot give a lock.
1795                          */
1796                         failed = iter;
1797                         mem_cgroup_iter_break(memcg, iter);
1798                         break;
1799                 } else
1800                         iter->oom_lock = true;
1801         }
1802
1803         if (!failed)
1804                 return true;
1805
1806         /*
1807          * OK, we failed to lock the whole subtree so we have to clean up
1808          * what we set up to the failing subtree
1809          */
1810         for_each_mem_cgroup_tree(iter, memcg) {
1811                 if (iter == failed) {
1812                         mem_cgroup_iter_break(memcg, iter);
1813                         break;
1814                 }
1815                 iter->oom_lock = false;
1816         }
1817         return false;
1818 }
1819
1820 /*
1821  * Has to be called with memcg_oom_lock
1822  */
1823 static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1824 {
1825         struct mem_cgroup *iter;
1826
1827         for_each_mem_cgroup_tree(iter, memcg)
1828                 iter->oom_lock = false;
1829         return 0;
1830 }
1831
1832 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1833 {
1834         struct mem_cgroup *iter;
1835
1836         for_each_mem_cgroup_tree(iter, memcg)
1837                 atomic_inc(&iter->under_oom);
1838 }
1839
1840 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1841 {
1842         struct mem_cgroup *iter;
1843
1844         /*
1845          * When a new child is created while the hierarchy is under oom,
1846          * mem_cgroup_oom_lock() may not be called. We have to use
1847          * atomic_add_unless() here.
1848          */
1849         for_each_mem_cgroup_tree(iter, memcg)
1850                 atomic_add_unless(&iter->under_oom, -1, 0);
1851 }
1852
/*
 * memcg_oom_lock is held around mem_cgroup_oom_lock()/mem_cgroup_oom_unlock()
 * and the OOM wakeup in mem_cgroup_handle_oom().
 */
static DEFINE_SPINLOCK(memcg_oom_lock);
/* Wait queue on which tasks sleep in mem_cgroup_handle_oom(). */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1855
/*
 * Per-waiter record queued on memcg_oom_waitq. The wake function recovers it
 * from the embedded wait entry with container_of().
 */
struct oom_wait_info {
        struct mem_cgroup *mem;         /* memcg the waiter is blocked on */
        wait_queue_t    wait;           /* entry linked into memcg_oom_waitq */
};
1860
1861 static int memcg_oom_wake_function(wait_queue_t *wait,
1862         unsigned mode, int sync, void *arg)
1863 {
1864         struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
1865                           *oom_wait_memcg;
1866         struct oom_wait_info *oom_wait_info;
1867
1868         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1869         oom_wait_memcg = oom_wait_info->mem;
1870
1871         /*
1872          * Both of oom_wait_info->mem and wake_mem are stable under us.
1873          * Then we can use css_is_ancestor without taking care of RCU.
1874          */
1875         if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1876                 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1877                 return 0;
1878         return autoremove_wake_function(wait, mode, sync, arg);
1879 }
1880
/*
 * Issue a wakeup on memcg_oom_waitq. @memcg is passed through as the key
 * argument that the waiters' wake function receives.
 */
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
        /* for filtering, pass "memcg" as argument. */
        __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
1886
1887 static void memcg_oom_recover(struct mem_cgroup *memcg)
1888 {
1889         if (memcg && atomic_read(&memcg->under_oom))
1890                 memcg_wakeup_oom(memcg);
1891 }
1892
1893 /*
1894  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1895  */
1896 bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1897 {
1898         struct oom_wait_info owait;
1899         bool locked, need_to_kill;
1900
1901         owait.mem = memcg;
1902         owait.wait.flags = 0;
1903         owait.wait.func = memcg_oom_wake_function;
1904         owait.wait.private = current;
1905         INIT_LIST_HEAD(&owait.wait.task_list);
1906         need_to_kill = true;
1907         mem_cgroup_mark_under_oom(memcg);
1908
1909         /* At first, try to OOM lock hierarchy under memcg.*/
1910         spin_lock(&memcg_oom_lock);
1911         locked = mem_cgroup_oom_lock(memcg);
1912         /*
1913          * Even if signal_pending(), we can't quit charge() loop without
1914          * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1915          * under OOM is always welcomed, use TASK_KILLABLE here.
1916          */
1917         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1918         if (!locked || memcg->oom_kill_disable)
1919                 need_to_kill = false;
1920         if (locked)
1921                 mem_cgroup_oom_notify(memcg);
1922         spin_unlock(&memcg_oom_lock);
1923
1924         if (need_to_kill) {
1925                 finish_wait(&memcg_oom_waitq, &owait.wait);
1926                 mem_cgroup_out_of_memory(memcg, mask);
1927         } else {
1928                 schedule();
1929                 finish_wait(&memcg_oom_waitq, &owait.wait);
1930         }
1931         spin_lock(&memcg_oom_lock);
1932         if (locked)
1933                 mem_cgroup_oom_unlock(memcg);
1934         memcg_wakeup_oom(memcg);
1935         spin_unlock(&memcg_oom_lock);
1936
1937         mem_cgroup_unmark_under_oom(memcg);
1938
1939         if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1940                 return false;
1941         /* Give chance to dying process */
1942         schedule_timeout_uninterruptible(1);
1943         return true;
1944 }
1945
1946 /*
1947  * Currently used to update mapped file statistics, but the routine can be
1948  * generalized to update other statistics as well.
1949  *
1950  * Notes: Race condition
1951  *
1952  * We usually use page_cgroup_lock() for accessing page_cgroup member but
1953  * it tends to be costly. But considering some conditions, we doesn't need
1954  * to do so _always_.
1955  *
1956  * Considering "charge", lock_page_cgroup() is not required because all
1957  * file-stat operations happen after a page is attached to radix-tree. There
1958  * are no race with "charge".
1959  *
1960  * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1961  * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
1962  * if there are race with "uncharge". Statistics itself is properly handled
1963  * by flags.
1964  *
1965  * Considering "move", this is an only case we see a race. To make the race
1966  * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
1967  * possibility of race condition. If there is, we take a lock.
1968  */
1969
1970 void mem_cgroup_update_page_stat(struct page *page,
1971                                  enum mem_cgroup_page_stat_item idx, int val)
1972 {
1973         struct mem_cgroup *memcg;
1974         struct page_cgroup *pc = lookup_page_cgroup(page);
1975         bool need_unlock = false;
1976         unsigned long uninitialized_var(flags);
1977
1978         if (unlikely(!pc))
1979                 return;
1980
1981         rcu_read_lock();
1982         memcg = pc->mem_cgroup;
1983         if (unlikely(!memcg || !PageCgroupUsed(pc)))
1984                 goto out;
1985         /* pc->mem_cgroup is unstable ? */
1986         if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
1987                 /* take a lock against to access pc->mem_cgroup */
1988                 move_lock_page_cgroup(pc, &flags);
1989                 need_unlock = true;
1990                 memcg = pc->mem_cgroup;
1991                 if (!memcg || !PageCgroupUsed(pc))
1992                         goto out;
1993         }
1994
1995         switch (idx) {
1996         case MEMCG_NR_FILE_MAPPED:
1997                 if (val > 0)
1998                         SetPageCgroupFileMapped(pc);
1999                 else if (!page_mapped(page))
2000                         ClearPageCgroupFileMapped(pc);
2001                 idx = MEM_CGROUP_STAT_FILE_MAPPED;
2002                 break;
2003         default:
2004                 BUG();
2005         }
2006
2007         this_cpu_add(memcg->stat->count[idx], val);
2008
2009 out:
2010         if (unlikely(need_unlock))
2011                 move_unlock_page_cgroup(pc, &flags);
2012         rcu_read_unlock();
2013         return;
2014 }
2015 EXPORT_SYMBOL(mem_cgroup_update_page_stat);
2016
/*
 * Number of pages requested on the first charge attempt. The value "32"
 * comes from vmscan.c's magic value.
 * TODO: larger values may be necessary on big-iron machines.
 */
#define CHARGE_BATCH    32U
/*
 * Per-CPU cache of pages that were already charged to a memcg's res_counter
 * and can be handed out without charging the counter again.
 */
struct memcg_stock_pcp {
        struct mem_cgroup *cached; /* owner of the stock; never the root cgroup */
        unsigned int nr_pages;  /* number of pre-charged pages in the stock */
        struct work_struct work; /* scheduled to drain this stock on its CPU */
        unsigned long flags;    /* bit field; see FLUSHING_CACHED_CHARGE */
/* Bit number in ->flags: set while a drain of this stock is pending. */
#define FLUSHING_CACHED_CHARGE  (0)
};
/* One charge stock per CPU. */
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
/* Held by the drain_all_stock_async()/drain_all_stock_sync() wrappers. */
static DEFINE_MUTEX(percpu_charge_mutex);
2031
2032 /*
2033  * Try to consume stocked charge on this cpu. If success, one page is consumed
2034  * from local stock and true is returned. If the stock is 0 or charges from a
2035  * cgroup which is not current target, returns false. This stock will be
2036  * refilled.
2037  */
2038 static bool consume_stock(struct mem_cgroup *memcg)
2039 {
2040         struct memcg_stock_pcp *stock;
2041         bool ret = true;
2042
2043         stock = &get_cpu_var(memcg_stock);
2044         if (memcg == stock->cached && stock->nr_pages)
2045                 stock->nr_pages--;
2046         else /* need to call res_counter_charge */
2047                 ret = false;
2048         put_cpu_var(memcg_stock);
2049         return ret;
2050 }
2051
2052 /*
2053  * Returns stocks cached in percpu to res_counter and reset cached information.
2054  */
2055 static void drain_stock(struct memcg_stock_pcp *stock)
2056 {
2057         struct mem_cgroup *old = stock->cached;
2058
2059         if (stock->nr_pages) {
2060                 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2061
2062                 res_counter_uncharge(&old->res, bytes);
2063                 if (do_swap_account)
2064                         res_counter_uncharge(&old->memsw, bytes);
2065                 stock->nr_pages = 0;
2066         }
2067         stock->cached = NULL;
2068 }
2069
2070 /*
2071  * This must be called under preempt disabled or must be called by
2072  * a thread which is pinned to local cpu.
2073  */
2074 static void drain_local_stock(struct work_struct *dummy)
2075 {
2076         struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2077         drain_stock(stock);
2078         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2079 }
2080
2081 /*
2082  * Cache charges(val) which is from res_counter, to local per_cpu area.
2083  * This will be consumed by consume_stock() function, later.
2084  */
2085 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2086 {
2087         struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2088
2089         if (stock->cached != memcg) { /* reset if necessary */
2090                 drain_stock(stock);
2091                 stock->cached = memcg;
2092         }
2093         stock->nr_pages += nr_pages;
2094         put_cpu_var(memcg_stock);
2095 }
2096
2097 /*
2098  * Drains all per-CPU charge caches for given root_memcg resp. subtree
2099  * of the hierarchy under it. sync flag says whether we should block
2100  * until the work is done.
2101  */
2102 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2103 {
2104         int cpu, curcpu;
2105
2106         /* Notify other cpus that system-wide "drain" is running */
2107         get_online_cpus();
2108         curcpu = get_cpu();
2109         for_each_online_cpu(cpu) {
2110                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2111                 struct mem_cgroup *memcg;
2112
2113                 memcg = stock->cached;
2114                 if (!memcg || !stock->nr_pages)
2115                         continue;
2116                 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2117                         continue;
2118                 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2119                         if (cpu == curcpu)
2120                                 drain_local_stock(&stock->work);
2121                         else
2122                                 schedule_work_on(cpu, &stock->work);
2123                 }
2124         }
2125         put_cpu();
2126
2127         if (!sync)
2128                 goto out;
2129
2130         for_each_online_cpu(cpu) {
2131                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2132                 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2133                         flush_work(&stock->work);
2134         }
2135 out:
2136         put_online_cpus();
2137 }
2138
2139 /*
2140  * Tries to drain stocked charges in other cpus. This function is asynchronous
2141  * and just put a work per cpu for draining localy on each cpu. Caller can
2142  * expects some charges will be back to res_counter later but cannot wait for
2143  * it.
2144  */
2145 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2146 {
2147         /*
2148          * If someone calls draining, avoid adding more kworker runs.
2149          */
2150         if (!mutex_trylock(&percpu_charge_mutex))
2151                 return;
2152         drain_all_stock(root_memcg, false);
2153         mutex_unlock(&percpu_charge_mutex);
2154 }
2155
/*
 * Synchronous drain interface: takes percpu_charge_mutex unconditionally and
 * calls drain_all_stock() with sync == true.
 */
static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
{
        /* called when force_empty is called */
        mutex_lock(&percpu_charge_mutex);
        drain_all_stock(root_memcg, true);
        mutex_unlock(&percpu_charge_mutex);
}
2164
2165 /*
2166  * This function drains percpu counter value from DEAD cpu and
2167  * move it to local cpu. Note that this function can be preempted.
2168  */
2169 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2170 {
2171         int i;
2172
2173         spin_lock(&memcg->pcp_counter_lock);
2174         for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2175                 long x = per_cpu(memcg->stat->count[i], cpu);
2176
2177                 per_cpu(memcg->stat->count[i], cpu) = 0;
2178                 memcg->nocpu_base.count[i] += x;
2179         }
2180         for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2181                 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2182
2183                 per_cpu(memcg->stat->events[i], cpu) = 0;
2184                 memcg->nocpu_base.events[i] += x;
2185         }
2186         /* need to clear ON_MOVE value, works as a kind of lock. */
2187         per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2188         spin_unlock(&memcg->pcp_counter_lock);
2189 }
2190
2191 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2192 {
2193         int idx = MEM_CGROUP_ON_MOVE;
2194
2195         spin_lock(&memcg->pcp_counter_lock);
2196         per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2197         spin_unlock(&memcg->pcp_counter_lock);
2198 }
2199
2200 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2201                                         unsigned long action,
2202                                         void *hcpu)
2203 {
2204         int cpu = (unsigned long)hcpu;
2205         struct memcg_stock_pcp *stock;
2206         struct mem_cgroup *iter;
2207
2208         if ((action == CPU_ONLINE)) {
2209                 for_each_mem_cgroup(iter)
2210                         synchronize_mem_cgroup_on_move(iter, cpu);
2211                 return NOTIFY_OK;
2212         }
2213
2214         if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2215                 return NOTIFY_OK;
2216
2217         for_each_mem_cgroup(iter)
2218                 mem_cgroup_drain_pcp_counter(iter, cpu);
2219
2220         stock = &per_cpu(memcg_stock, cpu);
2221         drain_stock(stock);
2222         return NOTIFY_OK;
2223 }
2224
2225
/*
 * Result codes returned by mem_cgroup_do_charge().
 * See __mem_cgroup_try_charge() for details on how each one is handled.
 */
enum {
        CHARGE_OK,              /* success */
        CHARGE_RETRY,           /* need to retry, but retrying is not bad */
        CHARGE_NOMEM,           /* we can't do more; return -ENOMEM */
        CHARGE_WOULDBLOCK,      /* __GFP_WAIT wasn't set and not enough resource */
        CHARGE_OOM_DIE,         /* current is being killed because of OOM */
};
2234
2235 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2236                                 unsigned int nr_pages, bool oom_check)
2237 {
2238         unsigned long csize = nr_pages * PAGE_SIZE;
2239         struct mem_cgroup *mem_over_limit;
2240         struct res_counter *fail_res;
2241         unsigned long flags = 0;
2242         int ret;
2243
2244         ret = res_counter_charge(&memcg->res, csize, &fail_res);
2245
2246         if (likely(!ret)) {
2247                 if (!do_swap_account)
2248                         return CHARGE_OK;
2249                 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2250                 if (likely(!ret))
2251                         return CHARGE_OK;
2252
2253                 res_counter_uncharge(&memcg->res, csize);
2254                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2255                 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2256         } else
2257                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2258         /*
2259          * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2260          * of regular pages (CHARGE_BATCH), or a single regular page (1).
2261          *
2262          * Never reclaim on behalf of optional batching, retry with a
2263          * single page instead.
2264          */
2265         if (nr_pages == CHARGE_BATCH)
2266                 return CHARGE_RETRY;
2267
2268         if (!(gfp_mask & __GFP_WAIT))
2269                 return CHARGE_WOULDBLOCK;
2270
2271         ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2272         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2273                 return CHARGE_RETRY;
2274         /*
2275          * Even though the limit is exceeded at this point, reclaim
2276          * may have been able to free some pages.  Retry the charge
2277          * before killing the task.
2278          *
2279          * Only for regular pages, though: huge pages are rather
2280          * unlikely to succeed so close to the limit, and we fall back
2281          * to regular pages anyway in case of failure.
2282          */
2283         if (nr_pages == 1 && ret)
2284                 return CHARGE_RETRY;
2285
2286         /*
2287          * At task move, charge accounts can be doubly counted. So, it's
2288          * better to wait until the end of task_move if something is going on.
2289          */
2290         if (mem_cgroup_wait_acct_move(mem_over_limit))
2291                 return CHARGE_RETRY;
2292
2293         /* If we don't need to call oom-killer at el, return immediately */
2294         if (!oom_check)
2295                 return CHARGE_NOMEM;
2296         /* check OOM */
2297         if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
2298                 return CHARGE_OOM_DIE;
2299
2300         return CHARGE_RETRY;
2301 }
2302
2303 /*
2304  * Unlike exported interface, "oom" parameter is added. if oom==true,
2305  * oom-killer can be invoked.
2306  */
2307 static int __mem_cgroup_try_charge(struct mm_struct *mm,
2308                                    gfp_t gfp_mask,
2309                                    unsigned int nr_pages,
2310                                    struct mem_cgroup **ptr,
2311                                    bool oom)
2312 {
2313         unsigned int batch = max(CHARGE_BATCH, nr_pages);
2314         int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2315         struct mem_cgroup *memcg = NULL;
2316         int ret;
2317
2318         /*
2319          * Unlike gloval-vm's OOM-kill, we're not in memory shortage
2320          * in system level. So, allow to go ahead dying process in addition to
2321          * MEMDIE process.
2322          */
2323         if (unlikely(test_thread_flag(TIF_MEMDIE)
2324                      || fatal_signal_pending(current)))
2325                 goto bypass;
2326
2327         /*
2328          * We always charge the cgroup the mm_struct belongs to.
2329          * The mm_struct's mem_cgroup changes on task migration if the
2330          * thread group leader migrates. It's possible that mm is not
2331          * set, if so charge the init_mm (happens for pagecache usage).
2332          */
2333         if (!*ptr && !mm)
2334                 goto bypass;
2335 again:
2336         if (*ptr) { /* css should be a valid one */
2337                 memcg = *ptr;
2338                 VM_BUG_ON(css_is_removed(&memcg->css));
2339                 if (mem_cgroup_is_root(memcg))
2340                         goto done;
2341                 if (nr_pages == 1 && consume_stock(memcg))
2342                         goto done;
2343                 css_get(&memcg->css);
2344         } else {
2345                 struct task_struct *p;
2346
2347                 rcu_read_lock();
2348                 p = rcu_dereference(mm->owner);
2349                 /*
2350                  * Because we don't have task_lock(), "p" can exit.
2351                  * In that case, "memcg" can point to root or p can be NULL with
2352                  * race with swapoff. Then, we have small risk of mis-accouning.
2353                  * But such kind of mis-account by race always happens because
2354                  * we don't have cgroup_mutex(). It's overkill and we allo that
2355                  * small race, here.
2356                  * (*) swapoff at el will charge against mm-struct not against
2357                  * task-struct. So, mm->owner can be NULL.
2358                  */
2359                 memcg = mem_cgroup_from_task(p);
2360                 if (!memcg || mem_cgroup_is_root(memcg)) {
2361                         rcu_read_unlock();
2362                         goto done;
2363                 }
2364                 if (nr_pages == 1 && consume_stock(memcg)) {
2365                         /*
2366                          * It seems dagerous to access memcg without css_get().
2367                          * But considering how consume_stok works, it's not
2368                          * necessary. If consume_stock success, some charges
2369                          * from this memcg are cached on this cpu. So, we
2370                          * don't need to call css_get()/css_tryget() before
2371                          * calling consume_stock().
2372                          */
2373                         rcu_read_unlock();
2374                         goto done;
2375                 }
2376                 /* after here, we may be blocked. we need to get refcnt */
2377                 if (!css_tryget(&memcg->css)) {
2378                         rcu_read_unlock();
2379                         goto again;
2380                 }
2381                 rcu_read_unlock();
2382         }
2383
2384         do {
2385                 bool oom_check;
2386
2387                 /* If killed, bypass charge */
2388                 if (fatal_signal_pending(current)) {
2389                         css_put(&memcg->css);
2390                         goto bypass;
2391                 }
2392
2393                 oom_check = false;
2394                 if (oom && !nr_oom_retries) {
2395                         oom_check = true;
2396                         nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2397                 }
2398
2399                 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2400                 switch (ret) {
2401                 case CHARGE_OK:
2402                         break;
2403                 case CHARGE_RETRY: /* not in OOM situation but retry */
2404                         batch = nr_pages;
2405                         css_put(&memcg->css);
2406                         memcg = NULL;
2407                         goto again;
2408                 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2409                         css_put(&memcg->css);
2410                         goto nomem;
2411                 case CHARGE_NOMEM: /* OOM routine works */
2412                         if (!oom) {
2413                                 css_put(&memcg->css);
2414                                 goto nomem;
2415                         }
2416                         /* If oom, we never return -ENOMEM */
2417                         nr_oom_retries--;
2418                         break;
2419                 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2420                         css_put(&memcg->css);
2421                         goto bypass;
2422                 }
2423         } while (ret != CHARGE_OK);
2424
2425         if (batch > nr_pages)
2426                 refill_stock(memcg, batch - nr_pages);
2427         css_put(&memcg->css);
2428 done:
2429         *ptr = memcg;
2430         return 0;
2431 nomem:
2432         *ptr = NULL;
2433         return -ENOMEM;
2434 bypass:
2435         *ptr = NULL;
2436         return 0;
2437 }
2438
2439 /*
2440  * Somemtimes we have to undo a charge we got by try_charge().
2441  * This function is for that and do uncharge, put css's refcnt.
2442  * gotten by try_charge().
2443  */
2444 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2445                                        unsigned int nr_pages)
2446 {
2447         if (!mem_cgroup_is_root(memcg)) {
2448                 unsigned long bytes = nr_pages * PAGE_SIZE;
2449
2450                 res_counter_uncharge(&memcg->res, bytes);
2451                 if (do_swap_account)
2452                         res_counter_uncharge(&memcg->memsw, bytes);
2453         }
2454 }
2455
2456 /*
2457  * A helper function to get mem_cgroup from ID. must be called under
2458  * rcu_read_lock(). The caller must check css_is_removed() or some if
2459  * it's concern. (dropping refcnt from swap can be called against removed
2460  * memcg.)
2461  */
2462 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2463 {
2464         struct cgroup_subsys_state *css;
2465
2466         /* ID 0 is unused ID */
2467         if (!id)
2468                 return NULL;
2469         css = css_lookup(&mem_cgroup_subsys, id);
2470         if (!css)
2471                 return NULL;
2472         return container_of(css, struct mem_cgroup, css);
2473 }
2474
2475 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2476 {
2477         struct mem_cgroup *memcg = NULL;
2478         struct page_cgroup *pc;
2479         unsigned short id;
2480         swp_entry_t ent;
2481
2482         VM_BUG_ON(!PageLocked(page));
2483
2484         pc = lookup_page_cgroup(page);
2485         lock_page_cgroup(pc);
2486         if (PageCgroupUsed(pc)) {
2487                 memcg = pc->mem_cgroup;
2488                 if (memcg && !css_tryget(&memcg->css))
2489                         memcg = NULL;
2490         } else if (PageSwapCache(page)) {
2491                 ent.val = page_private(page);
2492                 id = lookup_swap_cgroup(ent);
2493                 rcu_read_lock();
2494                 memcg = mem_cgroup_lookup(id);
2495                 if (memcg && !css_tryget(&memcg->css))
2496                         memcg = NULL;
2497                 rcu_read_unlock();
2498         }
2499         unlock_page_cgroup(pc);
2500         return memcg;
2501 }
2502
2503 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2504                                        struct page *page,
2505                                        unsigned int nr_pages,
2506                                        struct page_cgroup *pc,
2507                                        enum charge_type ctype)
2508 {
2509         lock_page_cgroup(pc);
2510         if (unlikely(PageCgroupUsed(pc))) {
2511                 unlock_page_cgroup(pc);
2512                 __mem_cgroup_cancel_charge(memcg, nr_pages);
2513                 return;
2514         }
2515         /*
2516          * we don't need page_cgroup_lock about tail pages, becase they are not
2517          * accessed by any other context at this point.
2518          */
2519         pc->mem_cgroup = memcg;
2520         /*
2521          * We access a page_cgroup asynchronously without lock_page_cgroup().
2522          * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2523          * is accessed after testing USED bit. To make pc->mem_cgroup visible
2524          * before USED bit, we need memory barrier here.
2525          * See mem_cgroup_add_lru_list(), etc.
2526          */
2527         smp_wmb();
2528         switch (ctype) {
2529         case MEM_CGROUP_CHARGE_TYPE_CACHE:
2530         case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2531                 SetPageCgroupCache(pc);
2532                 SetPageCgroupUsed(pc);
2533                 break;
2534         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2535                 ClearPageCgroupCache(pc);
2536                 SetPageCgroupUsed(pc);
2537                 break;
2538         default:
2539                 break;
2540         }
2541
2542         mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2543         unlock_page_cgroup(pc);
2544         /*
2545          * "charge_statistics" updated event counter. Then, check it.
2546          * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2547          * if they exceeds softlimit.
2548          */
2549         memcg_check_events(memcg, page);
2550 }
2551
2552 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2553
2554 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2555                         (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2556 /*
2557  * Because tail pages are not marked as "used", set it. We're under
2558  * zone->lru_lock, 'splitting on pmd' and compound_lock.
2559  * charge/uncharge will be never happen and move_account() is done under
2560  * compound_lock(), so we don't have to take care of races.
2561  */
2562 void mem_cgroup_split_huge_fixup(struct page *head)
2563 {
2564         struct page_cgroup *head_pc = lookup_page_cgroup(head);
2565         struct page_cgroup *pc;
2566         int i;
2567
2568         if (mem_cgroup_disabled())
2569                 return;
2570         for (i = 1; i < HPAGE_PMD_NR; i++) {
2571                 pc = head_pc + i;
2572                 pc->mem_cgroup = head_pc->mem_cgroup;
2573                 smp_wmb();/* see __commit_charge() */
2574                 /*
2575                  * LRU flags cannot be copied because we need to add tail
2576                  * page to LRU by generic call and our hooks will be called.
2577                  */
2578                 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2579         }
2580
2581         if (PageCgroupAcctLRU(head_pc)) {
2582                 enum lru_list lru;
2583                 struct mem_cgroup_per_zone *mz;
2584                 /*
2585                  * We hold lru_lock, then, reduce counter directly.
2586                  */
2587                 lru = page_lru(head);
2588                 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2589                 MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1;
2590         }
2591 }
2592 #endif
2593
2594 /**
2595  * mem_cgroup_move_account - move account of the page
2596  * @page: the page
2597  * @nr_pages: number of regular pages (>1 for huge pages)
2598  * @pc: page_cgroup of the page.
2599  * @from: mem_cgroup which the page is moved from.
2600  * @to: mem_cgroup which the page is moved to. @from != @to.
2601  * @uncharge: whether we should call uncharge and css_put against @from.
2602  *
2603  * The caller must confirm following.
2604  * - page is not on LRU (isolate_page() is useful.)
2605  * - compound_lock is held when nr_pages > 1
2606  *
2607  * This function doesn't do "charge" nor css_get to new cgroup. It should be
2608  * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
2609  * true, this function does "uncharge" from old cgroup, but it doesn't if
2610  * @uncharge is false, so a caller should do "uncharge".
2611  */
2612 static int mem_cgroup_move_account(struct page *page,
2613                                    unsigned int nr_pages,
2614                                    struct page_cgroup *pc,
2615                                    struct mem_cgroup *from,
2616                                    struct mem_cgroup *to,
2617                                    bool uncharge)
2618 {
2619         unsigned long flags;
2620         int ret;
2621
2622         VM_BUG_ON(from == to);
2623         VM_BUG_ON(PageLRU(page));
2624         /*
2625          * The page is isolated from LRU. So, collapse function
2626          * will not handle this page. But page splitting can happen.
2627          * Do this check under compound_page_lock(). The caller should
2628          * hold it.
2629          */
2630         ret = -EBUSY;
2631         if (nr_pages > 1 && !PageTransHuge(page))
2632                 goto out;
2633
2634         lock_page_cgroup(pc);
2635
2636         ret = -EINVAL;
2637         if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2638                 goto unlock;
2639
2640         move_lock_page_cgroup(pc, &flags);
2641
2642         if (PageCgroupFileMapped(pc)) {
2643                 /* Update mapped_file data for mem_cgroup */
2644                 preempt_disable();
2645                 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2646                 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2647                 preempt_enable();
2648         }
2649         mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2650         if (uncharge)
2651                 /* This is not "cancel", but cancel_charge does all we need. */
2652                 __mem_cgroup_cancel_charge(from, nr_pages);
2653
2654         /* caller should have done css_get */
2655         pc->mem_cgroup = to;
2656         mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2657         /*
2658          * We charges against "to" which may not have any tasks. Then, "to"
2659          * can be under rmdir(). But in current implementation, caller of
2660          * this function is just force_empty() and move charge, so it's
2661          * guaranteed that "to" is never removed. So, we don't check rmdir
2662          * status here.
2663          */
2664         move_unlock_page_cgroup(pc, &flags);
2665         ret = 0;
2666 unlock:
2667         unlock_page_cgroup(pc);
2668         /*
2669          * check events
2670          */
2671         memcg_check_events(to, page);
2672         memcg_check_events(from, page);
2673 out:
2674         return ret;
2675 }
2676
2677 /*
2678  * move charges to its parent.
2679  */
2680
2681 static int mem_cgroup_move_parent(struct page *page,
2682                                   struct page_cgroup *pc,
2683                                   struct mem_cgroup *child,
2684                                   gfp_t gfp_mask)
2685 {
2686         struct cgroup *cg = child->css.cgroup;
2687         struct cgroup *pcg = cg->parent;
2688         struct mem_cgroup *parent;
2689         unsigned int nr_pages;
2690         unsigned long uninitialized_var(flags);
2691         int ret;
2692
2693         /* Is ROOT ? */
2694         if (!pcg)
2695                 return -EINVAL;
2696
2697         ret = -EBUSY;
2698         if (!get_page_unless_zero(page))
2699                 goto out;
2700         if (isolate_lru_page(page))
2701                 goto put;
2702
2703         nr_pages = hpage_nr_pages(page);
2704
2705         parent = mem_cgroup_from_cont(pcg);
2706         ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2707         if (ret || !parent)
2708                 goto put_back;
2709
2710         if (nr_pages > 1)
2711                 flags = compound_lock_irqsave(page);
2712
2713         ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2714         if (ret)
2715                 __mem_cgroup_cancel_charge(parent, nr_pages);
2716
2717         if (nr_pages > 1)
2718                 compound_unlock_irqrestore(page, flags);
2719 put_back:
2720         putback_lru_page(page);
2721 put:
2722         put_page(page);
2723 out:
2724         return ret;
2725 }
2726
2727 /*
2728  * Charge the memory controller for page usage.
2729  * Return
2730  * 0 if the charge was successful
2731  * < 0 if the cgroup is over its limit
2732  */
2733 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2734                                 gfp_t gfp_mask, enum charge_type ctype)
2735 {
2736         struct mem_cgroup *memcg = NULL;
2737         unsigned int nr_pages = 1;
2738         struct page_cgroup *pc;
2739         bool oom = true;
2740         int ret;
2741
2742         if (PageTransHuge(page)) {
2743                 nr_pages <<= compound_order(page);
2744                 VM_BUG_ON(!PageTransHuge(page));
2745                 /*
2746                  * Never OOM-kill a process for a huge page.  The
2747                  * fault handler will fall back to regular pages.
2748                  */
2749                 oom = false;
2750         }
2751
2752         pc = lookup_page_cgroup(page);
2753         BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2754
2755         ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2756         if (ret || !memcg)
2757                 return ret;
2758
2759         __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
2760         return 0;
2761 }
2762
2763 int mem_cgroup_newpage_charge(struct page *page,
2764                               struct mm_struct *mm, gfp_t gfp_mask)
2765 {
2766         if (mem_cgroup_disabled())
2767                 return 0;
2768         /*
2769          * If already mapped, we don't have to account.
2770          * If page cache, page->mapping has address_space.
2771          * But page->mapping may have out-of-use anon_vma pointer,
2772          * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
2773          * is NULL.
2774          */
2775         if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2776                 return 0;
2777         if (unlikely(!mm))
2778                 mm = &init_mm;
2779         return mem_cgroup_charge_common(page, mm, gfp_mask,
2780                                 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2781 }
2782
/* Forward declaration: used by mem_cgroup_cache_charge(), defined later. */
static void __mem_cgroup_commit_charge_swapin(struct page *page,
                                              struct mem_cgroup *ptr,
                                              enum charge_type ctype);
2786
2787 static void
2788 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2789                                         enum charge_type ctype)
2790 {
2791         struct page_cgroup *pc = lookup_page_cgroup(page);
2792         /*
2793          * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
2794          * is already on LRU. It means the page may on some other page_cgroup's
2795          * LRU. Take care of it.
2796          */
2797         mem_cgroup_lru_del_before_commit(page);
2798         __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2799         mem_cgroup_lru_add_after_commit(page);
2800         return;
2801 }
2802
2803 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2804                                 gfp_t gfp_mask)
2805 {
2806         struct mem_cgroup *memcg = NULL;
2807         int ret;
2808
2809         if (mem_cgroup_disabled())
2810                 return 0;
2811         if (PageCompound(page))
2812                 return 0;
2813
2814         if (unlikely(!mm))
2815                 mm = &init_mm;
2816
2817         if (page_is_file_cache(page)) {
2818                 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
2819                 if (ret || !memcg)
2820                         return ret;
2821
2822                 /*
2823                  * FUSE reuses pages without going through the final
2824                  * put that would remove them from the LRU list, make
2825                  * sure that they get relinked properly.
2826                  */
2827                 __mem_cgroup_commit_charge_lrucare(page, memcg,
2828                                         MEM_CGROUP_CHARGE_TYPE_CACHE);
2829                 return ret;
2830         }
2831         /* shmem */
2832         if (PageSwapCache(page)) {
2833                 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2834                 if (!ret)
2835                         __mem_cgroup_commit_charge_swapin(page, memcg,
2836                                         MEM_CGROUP_CHARGE_TYPE_SHMEM);
2837         } else
2838                 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2839                                         MEM_CGROUP_CHARGE_TYPE_SHMEM);
2840
2841         return ret;
2842 }
2843
2844 /*
2845  * While swap-in, try_charge -> commit or cancel, the page is locked.
2846  * And when try_charge() successfully returns, one refcnt to memcg without
2847  * struct page_cgroup is acquired. This refcnt will be consumed by
2848  * "commit()" or removed by "cancel()"
2849  */
2850 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2851                                  struct page *page,
2852                                  gfp_t mask, struct mem_cgroup **ptr)
2853 {
2854         struct mem_cgroup *memcg;
2855         int ret;
2856
2857         *ptr = NULL;
2858
2859         if (mem_cgroup_disabled())
2860                 return 0;
2861
2862         if (!do_swap_account)
2863                 goto charge_cur_mm;
2864         /*
2865          * A racing thread's fault, or swapoff, may have already updated
2866          * the pte, and even removed page from swap cache: in those cases
2867          * do_swap_page()'s pte_same() test will fail; but there's also a
2868          * KSM case which does need to charge the page.
2869          */
2870         if (!PageSwapCache(page))
2871                 goto charge_cur_mm;
2872         memcg = try_get_mem_cgroup_from_page(page);
2873         if (!memcg)
2874                 goto charge_cur_mm;
2875         *ptr = memcg;
2876         ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2877         css_put(&memcg->css);
2878         return ret;
2879 charge_cur_mm:
2880         if (unlikely(!mm))
2881                 mm = &init_mm;
2882         return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2883 }
2884
2885 static void
2886 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2887                                         enum charge_type ctype)
2888 {
2889         if (mem_cgroup_disabled())
2890                 return;
2891         if (!ptr)
2892                 return;
2893         cgroup_exclude_rmdir(&ptr->css);
2894
2895         __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2896         /*
2897          * Now swap is on-memory. This means this page may be
2898          * counted both as mem and swap....double count.
2899          * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2900          * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
2901          * may call delete_from_swap_cache() before reach here.
2902          */
2903         if (do_swap_account && PageSwapCache(page)) {
2904                 swp_entry_t ent = {.val = page_private(page)};
2905                 unsigned short id;
2906                 struct mem_cgroup *memcg;
2907
2908                 id = swap_cgroup_record(ent, 0);
2909                 rcu_read_lock();
2910                 memcg = mem_cgroup_lookup(id);
2911                 if (memcg) {
2912                         /*
2913                          * This recorded memcg can be obsolete one. So, avoid
2914                          * calling css_tryget
2915                          */
2916                         if (!mem_cgroup_is_root(memcg))
2917                                 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2918                         mem_cgroup_swap_statistics(memcg, false);
2919                         mem_cgroup_put(memcg);
2920                 }
2921                 rcu_read_unlock();
2922         }
2923         /*
2924          * At swapin, we may charge account against cgroup which has no tasks.
2925          * So, rmdir()->pre_destroy() can be called while we do this charge.
2926          * In that case, we need to call pre_destroy() again. check it here.
2927          */
2928         cgroup_release_and_wakeup_rmdir(&ptr->css);
2929 }
2930
2931 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2932 {
2933         __mem_cgroup_commit_charge_swapin(page, ptr,
2934                                         MEM_CGROUP_CHARGE_TYPE_MAPPED);
2935 }
2936
/*
 * Cancel a one-page swap-in charge taken against @memcg.  Does nothing when
 * the memory controller is disabled or @memcg is NULL.
 */
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
        if (mem_cgroup_disabled() || !memcg)
                return;

        __mem_cgroup_cancel_charge(memcg, 1);
}
2945
2946 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
2947                                    unsigned int nr_pages,
2948                                    const enum charge_type ctype)
2949 {
2950         struct memcg_batch_info *batch = NULL;
2951         bool uncharge_memsw = true;
2952
2953         /* If swapout, usage of swap doesn't decrease */
2954         if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2955                 uncharge_memsw = false;
2956
2957         batch = &current->memcg_batch;
2958         /*
2959          * In usual, we do css_get() when we remember memcg pointer.
2960          * But in this case, we keep res->usage until end of a series of
2961          * uncharges. Then, it's ok to ignore memcg's refcnt.
2962          */
2963         if (!batch->memcg)
2964                 batch->memcg = memcg;
2965         /*
2966          * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2967          * In those cases, all pages freed continuously can be expected to be in
2968          * the same cgroup and we have chance to coalesce uncharges.
2969          * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
2970          * because we want to do uncharge as soon as possible.
2971          */
2972
2973         if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2974                 goto direct_uncharge;
2975
2976         if (nr_pages > 1)
2977                 goto direct_uncharge;
2978
2979         /*
2980          * In typical case, batch->memcg == mem. This means we can
2981          * merge a series of uncharges to an uncharge of res_counter.
2982          * If not, we uncharge res_counter ony by one.
2983          */
2984         if (batch->memcg != memcg)
2985                 goto direct_uncharge;
2986         /* remember freed charge and uncharge it later */
2987         batch->nr_pages++;
2988         if (uncharge_memsw)
2989                 batch->memsw_nr_pages++;
2990         return;
2991 direct_uncharge:
2992         res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
2993         if (uncharge_memsw)
2994                 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2995         if (unlikely(batch->memcg != memcg))
2996                 memcg_oom_recover(memcg);
2997         return;
2998 }
2999
3000 /*
3001  * uncharge if !page_mapped(page)
3002  */
3003 static struct mem_cgroup *
3004 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3005 {
3006         struct mem_cgroup *memcg = NULL;
3007         unsigned int nr_pages = 1;
3008         struct page_cgroup *pc;
3009
3010         if (mem_cgroup_disabled())
3011                 return NULL;
3012
3013         if (PageSwapCache(page))
3014                 return NULL;
3015
3016         if (PageTransHuge(page)) {
3017                 nr_pages <<= compound_order(page);
3018                 VM_BUG_ON(!PageTransHuge(page));
3019         }
3020         /*
3021          * Check if our page_cgroup is valid
3022          */
3023         pc = lookup_page_cgroup(page);
3024         if (unlikely(!pc || !PageCgroupUsed(pc)))
3025                 return NULL;
3026
3027         lock_page_cgroup(pc);
3028
3029         memcg = pc->mem_cgroup;
3030
3031         if (!PageCgroupUsed(pc))
3032                 goto unlock_out;
3033
3034         switch (ctype) {
3035         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
3036         case MEM_CGROUP_CHARGE_TYPE_DROP:
3037                 /* See mem_cgroup_prepare_migration() */
3038                 if (page_mapped(page) || PageCgroupMigration(pc))
3039                         goto unlock_out;
3040                 break;
3041         case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3042                 if (!PageAnon(page)) {  /* Shared memory */
3043                         if (page->mapping && !page_is_file_cache(page))
3044                                 goto unlock_out;
3045                 } else if (page_mapped(page)) /* Anon */
3046                                 goto unlock_out;
3047                 break;
3048         default:
3049                 break;
3050         }
3051
3052         mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
3053
3054         ClearPageCgroupUsed(pc);
3055         /*
3056          * pc->mem_cgroup is not cleared here. It will be accessed when it's
3057          * freed from LRU. This is safe because uncharged page is expected not
3058          * to be reused (freed soon). Exception is SwapCache, it's handled by
3059          * special functions.
3060          */
3061
3062         unlock_page_cgroup(pc);
3063         /*
3064          * even after unlock, we have memcg->res.usage here and this memcg
3065          * will never be freed.
3066          */
3067         memcg_check_events(memcg, page);
3068         if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3069                 mem_cgroup_swap_statistics(memcg, true);
3070                 mem_cgroup_get(memcg);
3071         }
3072         if (!mem_cgroup_is_root(memcg))
3073                 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3074
3075         return memcg;
3076
3077 unlock_out:
3078         unlock_page_cgroup(pc);
3079         return NULL;
3080 }
3081
3082 void mem_cgroup_uncharge_page(struct page *page)
3083 {
3084         /* early check. */
3085         if (page_mapped(page))
3086                 return;
3087         if (page->mapping && !PageAnon(page))
3088                 return;
3089         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3090 }
3091
3092 void mem_cgroup_uncharge_cache_page(struct page *page)
3093 {
3094         VM_BUG_ON(page_mapped(page));
3095         VM_BUG_ON(page->mapping);
3096         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
3097 }
3098
3099 /*
3100  * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
3101  * In that cases, pages are freed continuously and we can expect pages
3102  * are in the same memcg. All these calls itself limits the number of
3103  * pages freed at once, then uncharge_start/end() is called properly.
3104  * This may be called prural(2) times in a context,
3105  */
3106
3107 void mem_cgroup_uncharge_start(void)
3108 {
3109         current->memcg_batch.do_batch++;
3110         /* We can do nest. */
3111         if (current->memcg_batch.do_batch == 1) {
3112                 current->memcg_batch.memcg = NULL;
3113                 current->memcg_batch.nr_pages = 0;
3114                 current->memcg_batch.memsw_nr_pages = 0;
3115         }
3116 }
3117
3118 void mem_cgroup_uncharge_end(void)
3119 {
3120         struct memcg_batch_info *batch = &current->memcg_batch;
3121
3122         if (!batch->do_batch)
3123                 return;
3124
3125         batch->do_batch--;
3126         if (batch->do_batch) /* If stacked, do nothing. */
3127                 return;
3128
3129         if (!batch->memcg)
3130                 return;
3131         /*
3132          * This "batch->memcg" is valid without any css_get/put etc...
3133          * bacause we hide charges behind us.
3134          */
3135         if (batch->nr_pages)
3136                 res_counter_uncharge(&batch->memcg->res,
3137                                      batch->nr_pages * PAGE_SIZE);
3138         if (batch->memsw_nr_pages)
3139                 res_counter_uncharge(&batch->memcg->memsw,
3140                                      batch->memsw_nr_pages * PAGE_SIZE);
3141         memcg_oom_recover(batch->memcg);
3142         /* forget this pointer (for sanity check) */
3143         batch->memcg = NULL;
3144 }
3145
3146 #ifdef CONFIG_SWAP
3147 /*
3148  * called after __delete_from_swap_cache() and drop "page" account.
3149  * memcg information is recorded to swap_cgroup of "ent"
3150  */
3151 void
3152 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3153 {
3154         struct mem_cgroup *memcg;
3155         int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3156
3157         if (!swapout) /* this was a swap cache but the swap is unused ! */
3158                 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3159
3160         memcg = __mem_cgroup_uncharge_common(page, ctype);
3161
3162         /*
3163          * record memcg information,  if swapout && memcg != NULL,
3164          * mem_cgroup_get() was called in uncharge().
3165          */
3166         if (do_swap_account && swapout && memcg)
3167                 swap_cgroup_record(ent, css_id(&memcg->css));
3168 }
3169 #endif
3170
3171 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3172 /*
3173  * called from swap_entry_free(). remove record in swap_cgroup and
3174  * uncharge "memsw" account.
3175  */
3176 void mem_cgroup_uncharge_swap(swp_entry_t ent)
3177 {
3178         struct mem_cgroup *memcg;
3179         unsigned short id;
3180
3181         if (!do_swap_account)
3182                 return;
3183
3184         id = swap_cgroup_record(ent, 0);
3185         rcu_read_lock();
3186         memcg = mem_cgroup_lookup(id);
3187         if (memcg) {
3188                 /*
3189                  * We uncharge this because swap is freed.
3190                  * This memcg can be obsolete one. We avoid calling css_tryget
3191                  */
3192                 if (!mem_cgroup_is_root(memcg))
3193                         res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3194                 mem_cgroup_swap_statistics(memcg, false);
3195                 mem_cgroup_put(memcg);
3196         }
3197         rcu_read_unlock();
3198 }
3199
3200 /**
3201  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3202  * @entry: swap entry to be moved
3203  * @from:  mem_cgroup which the entry is moved from
3204  * @to:  mem_cgroup which the entry is moved to
3205  * @need_fixup: whether we should fixup res_counters and refcounts.
3206  *
3207  * It succeeds only when the swap_cgroup's record for this entry is the same
3208  * as the mem_cgroup's id of @from.
3209  *
3210  * Returns 0 on success, -EINVAL on failure.
3211  *
3212  * The caller must have charged to @to, IOW, called res_counter_charge() about
3213  * both res and memsw, and called css_get().
3214  */
3215 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3216                 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3217 {
3218         unsigned short old_id, new_id;
3219
3220         old_id = css_id(&from->css);
3221         new_id = css_id(&to->css);
3222
3223         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3224                 mem_cgroup_swap_statistics(from, false);
3225                 mem_cgroup_swap_statistics(to, true);
3226                 /*
3227                  * This function is only called from task migration context now.
3228                  * It postpones res_counter and refcount handling till the end
3229                  * of task migration(mem_cgroup_clear_mc()) for performance
3230                  * improvement. But we cannot postpone mem_cgroup_get(to)
3231                  * because if the process that has been moved to @to does
3232                  * swap-in, the refcount of @to might be decreased to 0.
3233                  */
3234                 mem_cgroup_get(to);
3235                 if (need_fixup) {
3236                         if (!mem_cgroup_is_root(from))
3237                                 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3238                         mem_cgroup_put(from);
3239                         /*
3240                          * we charged both to->res and to->memsw, so we should
3241                          * uncharge to->res.
3242                          */
3243                         if (!mem_cgroup_is_root(to))
3244                                 res_counter_uncharge(&to->res, PAGE_SIZE);
3245                 }
3246                 return 0;
3247         }
3248         return -EINVAL;
3249 }
3250 #else
3251 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3252                 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3253 {
3254         return -EINVAL;
3255 }
3256 #endif
3257
3258 /*
3259  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3260  * page belongs to.
3261  */
3262 int mem_cgroup_prepare_migration(struct page *page,
3263         struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3264 {
3265         struct mem_cgroup *memcg = NULL;
3266         struct page_cgroup *pc;
3267         enum charge_type ctype;
3268         int ret = 0;
3269
3270         *ptr = NULL;
3271
3272         VM_BUG_ON(PageTransHuge(page));
3273         if (mem_cgroup_disabled())
3274                 return 0;
3275
3276         pc = lookup_page_cgroup(page);
3277         lock_page_cgroup(pc);
3278         if (PageCgroupUsed(pc)) {
3279                 memcg = pc->mem_cgroup;
3280                 css_get(&memcg->css);
3281                 /*
3282                  * At migrating an anonymous page, its mapcount goes down
3283                  * to 0 and uncharge() will be called. But, even if it's fully
3284                  * unmapped, migration may fail and this page has to be
3285                  * charged again. We set MIGRATION flag here and delay uncharge
3286                  * until end_migration() is called
3287                  *
3288                  * Corner Case Thinking
3289                  * A)
3290                  * When the old page was mapped as Anon and it's unmap-and-freed
3291                  * while migration was ongoing.
3292                  * If unmap finds the old page, uncharge() of it will be delayed
3293                  * until end_migration(). If unmap finds a new page, it's
3294                  * uncharged when it make mapcount to be 1->0. If unmap code
3295                  * finds swap_migration_entry, the new page will not be mapped
3296                  * and end_migration() will find it(mapcount==0).
3297                  *
3298                  * B)
3299                  * When the old page was mapped but migraion fails, the kernel
3300                  * remaps it. A charge for it is kept by MIGRATION flag even
3301                  * if mapcount goes down to 0. We can do remap successfully
3302                  * without charging it again.
3303                  *
3304                  * C)
3305                  * The "old" page is under lock_page() until the end of
3306                  * migration, so, the old page itself will not be swapped-out.
3307                  * If the new page is swapped out before end_migraton, our
3308                  * hook to usual swap-out path will catch the event.
3309                  */
3310                 if (PageAnon(page))
3311                         SetPageCgroupMigration(pc);
3312         }
3313         unlock_page_cgroup(pc);
3314         /*
3315          * If the page is not charged at this point,
3316          * we return here.
3317          */
3318         if (!memcg)
3319                 return 0;
3320
3321         *ptr = memcg;
3322         ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3323         css_put(&memcg->css);/* drop extra refcnt */
3324         if (ret || *ptr == NULL) {
3325                 if (PageAnon(page)) {
3326                         lock_page_cgroup(pc);
3327                         ClearPageCgroupMigration(pc);
3328                         unlock_page_cgroup(pc);
3329                         /*
3330                          * The old page may be fully unmapped while we kept it.
3331                          */
3332                         mem_cgroup_uncharge_page(page);
3333                 }
3334                 return -ENOMEM;
3335         }
3336         /*
3337          * We charge new page before it's used/mapped. So, even if unlock_page()
3338          * is called before end_migration, we can catch all events on this new
3339          * page. In the case new page is migrated but not remapped, new page's
3340          * mapcount will be finally 0 and we call uncharge in end_migration().
3341          */
3342         pc = lookup_page_cgroup(newpage);
3343         if (PageAnon(page))
3344                 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3345         else if (page_is_file_cache(page))
3346                 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3347         else
3348                 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3349         __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
3350         return ret;
3351 }
3352
3353 /* remove redundant charge if migration failed*/
3354 void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3355         struct page *oldpage, struct page *newpage, bool migration_ok)
3356 {
3357         struct page *used, *unused;
3358         struct page_cgroup *pc;
3359
3360         if (!memcg)
3361                 return;
3362         /* blocks rmdir() */
3363         cgroup_exclude_rmdir(&memcg->css);
3364         if (!migration_ok) {
3365                 used = oldpage;
3366                 unused = newpage;
3367         } else {
3368                 used = newpage;
3369                 unused = oldpage;
3370         }
3371         /*
3372          * We disallowed uncharge of pages under migration because mapcount
3373          * of the page goes down to zero, temporarly.
3374          * Clear the flag and check the page should be charged.
3375          */
3376         pc = lookup_page_cgroup(oldpage);
3377         lock_page_cgroup(pc);
3378         ClearPageCgroupMigration(pc);
3379         unlock_page_cgroup(pc);
3380
3381         __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3382
3383         /*
3384          * If a page is a file cache, radix-tree replacement is very atomic
3385          * and we can skip this check. When it was an Anon page, its mapcount
3386          * goes down to 0. But because we added MIGRATION flage, it's not
3387          * uncharged yet. There are several case but page->mapcount check
3388          * and USED bit check in mem_cgroup_uncharge_page() will do enough
3389          * check. (see prepare_charge() also)
3390          */
3391         if (PageAnon(used))
3392                 mem_cgroup_uncharge_page(used);
3393         /*
3394          * At migration, we may charge account against cgroup which has no
3395          * tasks.
3396          * So, rmdir()->pre_destroy() can be called while we do this charge.
3397          * In that case, we need to call pre_destroy() again. check it here.
3398          */
3399         cgroup_release_and_wakeup_rmdir(&memcg->css);
3400 }
3401
3402 /*
3403  * At replace page cache, newpage is not under any memcg but it's on
3404  * LRU. So, this function doesn't touch res_counter but handles LRU
3405  * in correct way. Both pages are locked so we cannot race with uncharge.
3406  */
3407 void mem_cgroup_replace_page_cache(struct page *oldpage,
3408                                   struct page *newpage)
3409 {
3410         struct mem_cgroup *memcg;
3411         struct page_cgroup *pc;
3412         struct zone *zone;
3413         enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3414         unsigned long flags;
3415
3416         if (mem_cgroup_disabled())
3417                 return;
3418
3419         pc = lookup_page_cgroup(oldpage);
3420         /* fix accounting on old pages */
3421         lock_page_cgroup(pc);
3422         memcg = pc->mem_cgroup;
3423         mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3424         ClearPageCgroupUsed(pc);
3425         unlock_page_cgroup(pc);
3426
3427         if (PageSwapBacked(oldpage))
3428                 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3429
3430         zone = page_zone(newpage);
3431         pc = lookup_page_cgroup(newpage);
3432         /*
3433          * Even if newpage->mapping was NULL before starting replacement,
3434          * the newpage may be on LRU(or pagevec for LRU) already. We lock
3435          * LRU while we overwrite pc->mem_cgroup.
3436          */
3437         spin_lock_irqsave(&zone->lru_lock, flags);
3438         if (PageLRU(newpage))
3439                 del_page_from_lru_list(zone, newpage, page_lru(newpage));
3440         __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
3441         if (PageLRU(newpage))
3442                 add_page_to_lru_list(zone, newpage, page_lru(newpage));
3443         spin_unlock_irqrestore(&zone->lru_lock, flags);
3444 }
3445
3446 #ifdef CONFIG_DEBUG_VM
3447 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3448 {
3449         struct page_cgroup *pc;
3450
3451         pc = lookup_page_cgroup(page);
3452         if (likely(pc) && PageCgroupUsed(pc))
3453                 return pc;
3454         return NULL;
3455 }
3456
3457 bool mem_cgroup_bad_page_check(struct page *page)
3458 {
3459         if (mem_cgroup_disabled())
3460                 return false;
3461
3462         return lookup_page_cgroup_used(page) != NULL;
3463 }
3464
3465 void mem_cgroup_print_bad_page(struct page *page)
3466 {
3467         struct page_cgroup *pc;
3468
3469         pc = lookup_page_cgroup_used(page);
3470         if (pc) {
3471                 int ret = -1;
3472                 char *path;
3473
3474                 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3475                        pc, pc->flags, pc->mem_cgroup);
3476
3477                 path = kmalloc(PATH_MAX, GFP_KERNEL);
3478                 if (path) {
3479                         rcu_read_lock();
3480                         ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3481                                                         path, PATH_MAX);
3482                         rcu_read_unlock();
3483                 }
3484
3485                 printk(KERN_CONT "(%s)\n",
3486                                 (ret < 0) ? "cannot get the path" : path);
3487                 kfree(path);
3488         }
3489 }
3490 #endif
3491
3492 static DEFINE_MUTEX(set_limit_mutex);
3493
3494 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3495                                 unsigned long long val)
3496 {
3497         int retry_count;
3498         u64 memswlimit, memlimit;
3499         int ret = 0;
3500         int children = mem_cgroup_count_children(memcg);
3501         u64 curusage, oldusage;
3502         int enlarge;
3503
3504         /*
3505          * For keeping hierarchical_reclaim simple, how long we should retry
3506          * is depends on callers. We set our retry-count to be function
3507          * of # of children which we should visit in this loop.
3508          */
3509         retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3510
3511         oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3512
3513         enlarge = 0;
3514         while (retry_count) {
3515                 if (signal_pending(current)) {
3516                         ret = -EINTR;
3517                         break;
3518                 }
3519                 /*
3520                  * Rather than hide all in some function, I do this in
3521                  * open coded manner. You see what this really does.
3522                  * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3523                  */
3524                 mutex_lock(&set_limit_mutex);
3525                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3526                 if (memswlimit < val) {
3527                         ret = -EINVAL;
3528                         mutex_unlock(&set_limit_mutex);
3529                         break;
3530                 }
3531
3532                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3533                 if (memlimit < val)
3534                         enlarge = 1;
3535
3536                 ret = res_counter_set_limit(&memcg->res, val);
3537                 if (!ret) {
3538                         if (memswlimit == val)
3539                                 memcg->memsw_is_minimum = true;
3540                         else
3541                                 memcg->memsw_is_minimum = false;
3542                 }
3543                 mutex_unlock(&set_limit_mutex);
3544
3545                 if (!ret)
3546                         break;
3547
3548                 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3549                                    MEM_CGROUP_RECLAIM_SHRINK);
3550                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3551                 /* Usage is reduced ? */
3552                 if (curusage >= oldusage)
3553                         retry_count--;
3554                 else
3555                         oldusage = curusage;
3556         }
3557         if (!ret && enlarge)
3558                 memcg_oom_recover(memcg);
3559
3560         return ret;
3561 }
3562
3563 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3564                                         unsigned long long val)
3565 {
3566         int retry_count;
3567         u64 memlimit, memswlimit, oldusage, curusage;
3568         int children = mem_cgroup_count_children(memcg);
3569         int ret = -EBUSY;
3570         int enlarge = 0;
3571
3572         /* see mem_cgroup_resize_res_limit */
3573         retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3574         oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3575         while (retry_count) {
3576                 if (signal_pending(current)) {
3577                         ret = -EINTR;
3578                         break;
3579                 }
3580                 /*
3581                  * Rather than hide all in some function, I do this in
3582                  * open coded manner. You see what this really does.
3583                  * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3584                  */
3585                 mutex_lock(&set_limit_mutex);
3586                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3587                 if (memlimit > val) {
3588                         ret = -EINVAL;
3589                         mutex_unlock(&set_limit_mutex);
3590                         break;
3591                 }
3592                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3593                 if (memswlimit < val)
3594                         enlarge = 1;
3595                 ret = res_counter_set_limit(&memcg->memsw, val);
3596                 if (!ret) {
3597                         if (memlimit == val)
3598                                 memcg->memsw_is_minimum = true;
3599                         else
3600                                 memcg->memsw_is_minimum = false;
3601                 }
3602                 mutex_unlock(&set_limit_mutex);
3603
3604                 if (!ret)
3605                         break;
3606
3607                 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3608                                    MEM_CGROUP_RECLAIM_NOSWAP |
3609                                    MEM_CGROUP_RECLAIM_SHRINK);
3610                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3611                 /* Usage is reduced ? */
3612                 if (curusage >= oldusage)
3613                         retry_count--;
3614                 else
3615                         oldusage = curusage;
3616         }
3617         if (!ret && enlarge)
3618                 memcg_oom_recover(memcg);
3619         return ret;
3620 }
3621
3622 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3623                                             gfp_t gfp_mask,
3624                                             unsigned long *total_scanned)
3625 {
3626         unsigned long nr_reclaimed = 0;
3627         struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3628         unsigned long reclaimed;
3629         int loop = 0;
3630         struct mem_cgroup_tree_per_zone *mctz;
3631         unsigned long long excess;
3632         unsigned long nr_scanned;
3633
3634         if (order > 0)
3635                 return 0;
3636
3637         mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3638         /*
3639          * This loop can run a while, specially if mem_cgroup's continuously
3640          * keep exceeding their soft limit and putting the system under
3641          * pressure
3642          */
3643         do {
3644                 if (next_mz)
3645                         mz = next_mz;
3646                 else
3647                         mz = mem_cgroup_largest_soft_limit_node(mctz);
3648                 if (!mz)
3649                         break;
3650
3651                 nr_scanned = 0;
3652                 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
3653                                                     gfp_mask, &nr_scanned);
3654                 nr_reclaimed += reclaimed;
3655                 *total_scanned += nr_scanned;
3656                 spin_lock(&mctz->lock);
3657
3658                 /*
3659                  * If we failed to reclaim anything from this memory cgroup
3660                  * it is time to move on to the next cgroup
3661                  */
3662                 next_mz = NULL;
3663                 if (!reclaimed) {
3664                         do {
3665                                 /*
3666                                  * Loop until we find yet another one.
3667                                  *
3668                                  * By the time we get the soft_limit lock
3669                                  * again, someone might have aded the
3670                                  * group back on the RB tree. Iterate to
3671                                  * make sure we get a different mem.
3672                                  * mem_cgroup_largest_soft_limit_node returns
3673                                  * NULL if no other cgroup is present on
3674                                  * the tree
3675                                  */
3676                                 next_mz =
3677                                 __mem_cgroup_largest_soft_limit_node(mctz);
3678                                 if (next_mz == mz)
3679                                         css_put(&next_mz->mem->css);
3680                                 else /* next_mz == NULL or other memcg */
3681                                         break;
3682                         } while (1);
3683                 }
3684                 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3685                 excess = res_counter_soft_limit_excess(&mz->mem->res);
3686                 /*
3687                  * One school of thought says that we should not add
3688                  * back the node to the tree if reclaim returns 0.
3689                  * But our reclaim could return 0, simply because due
3690                  * to priority we are exposing a smaller subset of
3691                  * memory to reclaim from. Consider this as a longer
3692                  * term TODO.
3693                  */
3694                 /* If excess == 0, no tree ops */
3695                 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3696                 spin_unlock(&mctz->lock);
3697                 css_put(&mz->mem->css);
3698                 loop++;
3699                 /*
3700                  * Could not reclaim anything and there are no more
3701                  * mem cgroups to try or we seem to be looping without
3702                  * reclaiming anything.
3703                  */
3704                 if (!nr_reclaimed &&
3705                         (next_mz == NULL ||
3706                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3707                         break;
3708         } while (!nr_reclaimed);
3709         if (next_mz)
3710                 css_put(&next_mz->mem->css);
3711         return nr_reclaimed;
3712 }
3713
3714 /*
3715  * This routine traverse page_cgroup in given list and drop them all.
3716  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
3717  */
3718 static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3719                                 int node, int zid, enum lru_list lru)
3720 {
3721         struct mem_cgroup_per_zone *mz;
3722         unsigned long flags, loop;
3723         struct list_head *list;
3724         struct page *busy;
3725         struct zone *zone;
3726         int ret = 0;
3727
3728         zone = &NODE_DATA(node)->node_zones[zid];
3729         mz = mem_cgroup_zoneinfo(memcg, node, zid);
3730         list = &mz->lruvec.lists[lru];
3731
3732         loop = MEM_CGROUP_ZSTAT(mz, lru);
3733         /* give some margin against EBUSY etc...*/
3734         loop += 256;
3735         busy = NULL;
3736         while (loop--) {
3737                 struct page_cgroup *pc;
3738                 struct page *page;
3739
3740                 ret = 0;
3741                 spin_lock_irqsave(&zone->lru_lock, flags);
3742                 if (list_empty(list)) {
3743                         spin_unlock_irqrestore(&zone->lru_lock, flags);
3744                         break;
3745                 }
3746                 page = list_entry(list->prev, struct page, lru);
3747                 if (busy == page) {
3748                         list_move(&page->lru, list);
3749                         busy = NULL;
3750                         spin_unlock_irqrestore(&zone->lru_lock, flags);
3751                         continue;
3752                 }
3753                 spin_unlock_irqrestore(&zone->lru_lock, flags);
3754
3755                 pc = lookup_page_cgroup(page);
3756
3757                 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3758                 if (ret == -ENOMEM)
3759                         break;
3760
3761                 if (ret == -EBUSY || ret == -EINVAL) {
3762                         /* found lock contention or "pc" is obsolete. */
3763                         busy = page;
3764                         cond_resched();
3765                 } else
3766                         busy = NULL;
3767         }
3768
3769         if (!ret && !list_empty(list))
3770                 return -EBUSY;
3771         return ret;
3772 }
3773
3774 /*
3775  * make mem_cgroup's charge to be 0 if there is no task.
3776  * This enables deleting this mem_cgroup.
3777  */
3778 static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
3779 {
3780         int ret;
3781         int node, zid, shrink;
3782         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3783         struct cgroup *cgrp = memcg->css.cgroup;
3784
3785         css_get(&memcg->css);
3786
3787         shrink = 0;
3788         /* should free all ? */
3789         if (free_all)
3790                 goto try_to_free;
3791 move_account:
3792         do {
3793                 ret = -EBUSY;
3794                 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3795                         goto out;
3796                 ret = -EINTR;
3797                 if (signal_pending(current))
3798                         goto out;
3799                 /* This is for making all *used* pages to be on LRU. */
3800                 lru_add_drain_all();
3801                 drain_all_stock_sync(memcg);
3802                 ret = 0;
3803                 mem_cgroup_start_move(memcg);
3804                 for_each_node_state(node, N_HIGH_MEMORY) {
3805                         for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3806                                 enum lru_list l;
3807                                 for_each_lru(l) {
3808                                         ret = mem_cgroup_force_empty_list(memcg,
3809                                                         node, zid, l);
3810                                         if (ret)
3811                                                 break;
3812                                 }
3813                         }
3814                         if (ret)
3815                                 break;
3816                 }
3817                 mem_cgroup_end_move(memcg);
3818                 memcg_oom_recover(memcg);
3819                 /* it seems parent cgroup doesn't have enough mem */
3820                 if (ret == -ENOMEM)
3821                         goto try_to_free;
3822                 cond_resched();
3823         /* "ret" should also be checked to ensure all lists are empty. */
3824         } while (memcg->res.usage > 0 || ret);
3825 out:
3826         css_put(&memcg->css);
3827         return ret;
3828
3829 try_to_free:
3830         /* returns EBUSY if there is a task or if we come here twice. */
3831         if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3832                 ret = -EBUSY;
3833                 goto out;
3834         }
3835         /* we call try-to-free pages for make this cgroup empty */
3836         lru_add_drain_all();
3837         /* try to free all pages in this cgroup */
3838         shrink = 1;
3839         while (nr_retries && memcg->res.usage > 0) {
3840                 int progress;
3841
3842                 if (signal_pending(current)) {
3843                         ret = -EINTR;
3844                         goto out;
3845                 }
3846                 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3847                                                 false);
3848                 if (!progress) {
3849                         nr_retries--;
3850                         /* maybe some writeback is necessary */
3851                         congestion_wait(BLK_RW_ASYNC, HZ/10);
3852                 }
3853
3854         }
3855         lru_add_drain();
3856         /* try move_account...there may be some *locked* pages. */
3857         goto move_account;
3858 }
3859
3860 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3861 {
3862         return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3863 }
3864
3865
3866 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3867 {
3868         return mem_cgroup_from_cont(cont)->use_hierarchy;
3869 }
3870
3871 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3872                                         u64 val)
3873 {
3874         int retval = 0;
3875         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3876         struct cgroup *parent = cont->parent;
3877         struct mem_cgroup *parent_memcg = NULL;
3878
3879         if (parent)
3880                 parent_memcg = mem_cgroup_from_cont(parent);
3881
3882         cgroup_lock();
3883         /*
3884          * If parent's use_hierarchy is set, we can't make any modifications
3885          * in the child subtrees. If it is unset, then the change can
3886          * occur, provided the current cgroup has no children.
3887          *
3888          * For the root cgroup, parent_mem is NULL, we allow value to be
3889          * set if there are no children.
3890          */
3891         if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3892                                 (val == 1 || val == 0)) {
3893                 if (list_empty(&cont->children))
3894                         memcg->use_hierarchy = val;
3895                 else
3896                         retval = -EBUSY;
3897         } else
3898                 retval = -EINVAL;
3899         cgroup_unlock();
3900
3901         return retval;
3902 }
3903
3904
3905 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3906                                                enum mem_cgroup_stat_index idx)
3907 {
3908         struct mem_cgroup *iter;
3909         long val = 0;
3910
3911         /* Per-cpu values can be negative, use a signed accumulator */
3912         for_each_mem_cgroup_tree(iter, memcg)
3913                 val += mem_cgroup_read_stat(iter, idx);
3914
3915         if (val < 0) /* race ? */
3916                 val = 0;
3917         return val;
3918 }
3919
3920 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3921 {
3922         u64 val;
3923
3924         if (!mem_cgroup_is_root(memcg)) {
3925                 val = 0;
3926 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
3927                 if (!memcg->kmem_independent_accounting)
3928                         val = res_counter_read_u64(&memcg->kmem, RES_USAGE);
3929 #endif
3930                 if (!swap)
3931                         val += res_counter_read_u64(&memcg->res, RES_USAGE);
3932                 else
3933                         val += res_counter_read_u64(&memcg->memsw, RES_USAGE);
3934
3935                 return val;
3936         }
3937
3938         val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
3939         val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3940
3941         if (swap)
3942                 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
3943
3944         return val << PAGE_SHIFT;
3945 }
3946
3947 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3948 {
3949         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3950         u64 val;
3951         int type, name;
3952
3953         type = MEMFILE_TYPE(cft->private);
3954         name = MEMFILE_ATTR(cft->private);
3955         switch (type) {
3956         case _MEM:
3957                 if (name == RES_USAGE)
3958                         val = mem_cgroup_usage(memcg, false);
3959                 else
3960                         val = res_counter_read_u64(&memcg->res, name);
3961                 break;
3962         case _MEMSWAP:
3963                 if (name == RES_USAGE)
3964                         val = mem_cgroup_usage(memcg, true);
3965                 else
3966                         val = res_counter_read_u64(&memcg->memsw, name);
3967                 break;
3968 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
3969         case _KMEM:
3970                 val = res_counter_read_u64(&memcg->kmem, name);
3971                 break;
3972 #endif
3973         default:
3974                 BUG();
3975                 break;
3976         }
3977         return val;
3978 }
3979 /*
3980  * The user of this function is...
3981  * RES_LIMIT.
3982  */
3983 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3984                             const char *buffer)
3985 {
3986         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3987         int type, name;
3988         unsigned long long val;
3989         int ret;
3990
3991         type = MEMFILE_TYPE(cft->private);
3992         name = MEMFILE_ATTR(cft->private);
3993         switch (name) {
3994         case RES_LIMIT:
3995                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3996                         ret = -EINVAL;
3997                         break;
3998                 }
3999                 /* This function does all necessary parse...reuse it */
4000                 ret = res_counter_memparse_write_strategy(buffer, &val);
4001                 if (ret)
4002                         break;
4003                 if (type == _MEM)
4004                         ret = mem_cgroup_resize_limit(memcg, val);
4005                 else
4006                         ret = mem_cgroup_resize_memsw_limit(memcg, val);
4007                 break;
4008         case RES_SOFT_LIMIT:
4009                 ret = res_counter_memparse_write_strategy(buffer, &val);
4010                 if (ret)
4011                         break;
4012                 /*
4013                  * For memsw, soft limits are hard to implement in terms
4014                  * of semantics, for now, we support soft limits for
4015                  * control without swap
4016                  */
4017                 if (type == _MEM)
4018                         ret = res_counter_set_soft_limit(&memcg->res, val);
4019                 else
4020                         ret = -EINVAL;
4021                 break;
4022         default:
4023                 ret = -EINVAL; /* should be BUG() ? */
4024                 break;
4025         }
4026         return ret;
4027 }
4028
4029 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4030                 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4031 {
4032         struct cgroup *cgroup;
4033         unsigned long long min_limit, min_memsw_limit, tmp;
4034
4035         min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4036         min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4037         cgroup = memcg->css.cgroup;
4038         if (!memcg->use_hierarchy)
4039                 goto out;
4040
4041         while (cgroup->parent) {
4042                 cgroup = cgroup->parent;
4043                 memcg = mem_cgroup_from_cont(cgroup);
4044                 if (!memcg->use_hierarchy)
4045                         break;
4046                 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4047                 min_limit = min(min_limit, tmp);
4048                 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4049                 min_memsw_limit = min(min_memsw_limit, tmp);
4050         }
4051 out:
4052         *mem_limit = min_limit;
4053         *memsw_limit = min_memsw_limit;
4054         return;
4055 }
4056
4057 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4058 {
4059         struct mem_cgroup *memcg;
4060         int type, name;
4061
4062         memcg = mem_cgroup_from_cont(cont);
4063         type = MEMFILE_TYPE(event);
4064         name = MEMFILE_ATTR(event);
4065         switch (name) {
4066         case RES_MAX_USAGE:
4067                 if (type == _MEM)
4068                         res_counter_reset_max(&memcg->res);
4069                 else
4070                         res_counter_reset_max(&memcg->memsw);
4071                 break;
4072         case RES_FAILCNT:
4073                 if (type == _MEM)
4074                         res_counter_reset_failcnt(&memcg->res);
4075                 else
4076                         res_counter_reset_failcnt(&memcg->memsw);
4077                 break;
4078         }
4079
4080         return 0;
4081 }
4082
4083 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4084                                         struct cftype *cft)
4085 {
4086         return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
4087 }
4088
4089 #ifdef CONFIG_MMU
4090 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4091                                         struct cftype *cft, u64 val)
4092 {
4093         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4094
4095         if (val >= (1 << NR_MOVE_TYPE))
4096                 return -EINVAL;
4097         /*
4098          * We check this value several times in both in can_attach() and
4099          * attach(), so we need cgroup lock to prevent this value from being
4100          * inconsistent.
4101          */
4102         cgroup_lock();
4103         memcg->move_charge_at_immigrate = val;
4104         cgroup_unlock();
4105
4106         return 0;
4107 }
4108 #else
4109 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4110                                         struct cftype *cft, u64 val)
4111 {
4112         return -ENOSYS;
4113 }
4114 #endif
4115
4116
/*
 * Indices into the statistics array filled in when the "stat" file is read.
 * The order must match memcg_stat_strings[]; NR_MCS_STAT is the entry count.
 */
enum {
        MCS_CACHE = 0,
        MCS_RSS,
        MCS_FILE_MAPPED,
        MCS_PGPGIN,
        MCS_PGPGOUT,
        MCS_SWAP,
        MCS_PGFAULT,
        MCS_PGMAJFAULT,
        MCS_INACTIVE_ANON,
        MCS_ACTIVE_ANON,
        MCS_INACTIVE_FILE,
        MCS_ACTIVE_FILE,
        MCS_UNEVICTABLE,
        NR_MCS_STAT,    /* number of statistics, keep last */
};
4134
4135 struct mcs_total_stat {
4136         s64 stat[NR_MCS_STAT];
4137 };
4138
/*
 * Key names emitted by the "stat" file, indexed by the MCS_* enum.
 * local_name labels the value of the group itself, total_name the value
 * accumulated over the group's subtree.
 *
 * The table is only ever read, so it is file-local and constant.
 */
static const struct {
        const char *local_name;
        const char *total_name;
} memcg_stat_strings[NR_MCS_STAT] = {
        {"cache", "total_cache"},
        {"rss", "total_rss"},
        {"mapped_file", "total_mapped_file"},
        {"pgpgin", "total_pgpgin"},
        {"pgpgout", "total_pgpgout"},
        {"swap", "total_swap"},
        {"pgfault", "total_pgfault"},
        {"pgmajfault", "total_pgmajfault"},
        {"inactive_anon", "total_inactive_anon"},
        {"active_anon", "total_active_anon"},
        {"inactive_file", "total_inactive_file"},
        {"active_file", "total_active_file"},
        {"unevictable", "total_unevictable"}
};
4157
4158
4159 static void
4160 mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4161 {
4162         s64 val;
4163
4164         /* per cpu stat */
4165         val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4166         s->stat[MCS_CACHE] += val * PAGE_SIZE;
4167         val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4168         s->stat[MCS_RSS] += val * PAGE_SIZE;
4169         val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4170         s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4171         val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4172         s->stat[MCS_PGPGIN] += val;
4173         val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4174         s->stat[MCS_PGPGOUT] += val;
4175         if (do_swap_account) {
4176                 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4177                 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4178         }
4179         val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4180         s->stat[MCS_PGFAULT] += val;
4181         val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4182         s->stat[MCS_PGMAJFAULT] += val;
4183
4184         /* per zone stat */
4185         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4186         s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4187         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4188         s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4189         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4190         s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4191         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4192         s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4193         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4194         s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4195 }
4196
4197 static void
4198 mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4199 {
4200         struct mem_cgroup *iter;
4201
4202         for_each_mem_cgroup_tree(iter, memcg)
4203                 mem_cgroup_get_local_stat(iter, s);
4204 }
4205
#ifdef CONFIG_NUMA
/*
 * Finish one numa_stat line: print " N<nid>=<pages>" for every node in the
 * N_HIGH_MEMORY state, counting the LRU lists selected by @lru_mask, and
 * then the terminating newline.
 */
static void mem_control_numa_stat_nodes(struct seq_file *m,
                                        struct mem_cgroup *memcg,
                                        unsigned int lru_mask)
{
        unsigned long nr;
        int nid;

        for_each_node_state(nid, N_HIGH_MEMORY) {
                nr = mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
                seq_printf(m, " N%d=%lu", nid, nr);
        }
        seq_putc(m, '\n');
}

/*
 * seq_file show routine for the "numa_stat" file.
 *
 * Prints four lines (total, file, anon, unevictable), each with the overall
 * LRU page count followed by the per-node breakdown.  The cgroup is taken
 * from m->private.  Always returns 0.
 */
static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
{
        struct cgroup *cont = m->private;
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
        unsigned long nr;

        nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
        seq_printf(m, "total=%lu", nr);
        mem_control_numa_stat_nodes(m, memcg, LRU_ALL);

        nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
        seq_printf(m, "file=%lu", nr);
        mem_control_numa_stat_nodes(m, memcg, LRU_ALL_FILE);

        nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
        seq_printf(m, "anon=%lu", nr);
        mem_control_numa_stat_nodes(m, memcg, LRU_ALL_ANON);

        nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
        seq_printf(m, "unevictable=%lu", nr);
        mem_control_numa_stat_nodes(m, memcg, BIT(LRU_UNEVICTABLE));

        return 0;
}
#endif /* CONFIG_NUMA */
4252
4253 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4254                                  struct cgroup_map_cb *cb)
4255 {
4256         struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4257         struct mcs_total_stat mystat;
4258         int i;
4259
4260         memset(&mystat, 0, sizeof(mystat));
4261         mem_cgroup_get_local_stat(mem_cont, &mystat);
4262
4263
4264         for (i = 0; i < NR_MCS_STAT; i++) {
4265                 if (i == MCS_SWAP && !do_swap_account)
4266                         continue;
4267                 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
4268         }
4269
4270         /* Hierarchical information */
4271         {
4272                 unsigned long long limit, memsw_limit;
4273                 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
4274                 cb->fill(cb, "hierarchical_memory_limit", limit);
4275                 if (do_swap_account)
4276                         cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4277         }
4278
4279         memset(&mystat, 0, sizeof(mystat));
4280         mem_cgroup_get_total_stat(mem_cont, &mystat);
4281         for (i = 0; i < NR_MCS_STAT; i++) {
4282                 if (i == MCS_SWAP && !do_swap_account)
4283                         continue;
4284                 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
4285         }
4286
4287 #ifdef CONFIG_DEBUG_VM
4288         {
4289                 int nid, zid;
4290                 struct mem_cgroup_per_zone *mz;
4291                 unsigned long recent_rotated[2] = {0, 0};
4292                 unsigned long recent_scanned[2] = {0, 0};
4293
4294                 for_each_online_node(nid)
4295                         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4296                                 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
4297
4298                                 recent_rotated[0] +=
4299                                         mz->reclaim_stat.recent_rotated[0];
4300                                 recent_rotated[1] +=
4301                                         mz->reclaim_stat.recent_rotated[1];
4302                                 recent_scanned[0] +=
4303                                         mz->reclaim_stat.recent_scanned[0];
4304                                 recent_scanned[1] +=
4305                                         mz->reclaim_stat.recent_scanned[1];
4306                         }
4307                 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4308                 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
4309                 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
4310                 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
4311         }
4312 #endif
4313
4314         return 0;
4315 }
4316
4317 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4318 {
4319         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4320
4321         return mem_cgroup_swappiness(memcg);
4322 }
4323
4324 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4325                                        u64 val)
4326 {
4327         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4328         struct mem_cgroup *parent;
4329
4330         if (val > 100)
4331                 return -EINVAL;
4332
4333         if (cgrp->parent == NULL)
4334                 return -EINVAL;
4335
4336         parent = mem_cgroup_from_cont(cgrp->parent);
4337
4338         cgroup_lock();
4339
4340         /* If under hierarchy, only empty-root can set this value */
4341         if ((parent->use_hierarchy) ||
4342             (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4343                 cgroup_unlock();
4344                 return -EINVAL;
4345         }
4346
4347         memcg->swappiness = val;
4348
4349         cgroup_unlock();
4350
4351         return 0;
4352 }
4353
4354 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4355 {
4356         struct mem_cgroup_threshold_ary *t;
4357         u64 usage;
4358         int i;
4359
4360         rcu_read_lock();
4361         if (!swap)
4362                 t = rcu_dereference(memcg->thresholds.primary);
4363         else
4364                 t = rcu_dereference(memcg->memsw_thresholds.primary);
4365
4366         if (!t)
4367                 goto unlock;
4368
4369         usage = mem_cgroup_usage(memcg, swap);
4370
4371         /*
4372          * current_threshold points to threshold just below usage.
4373          * If it's not true, a threshold was crossed after last
4374          * call of __mem_cgroup_threshold().
4375          */
4376         i = t->current_threshold;
4377
4378         /*
4379          * Iterate backward over array of thresholds starting from
4380          * current_threshold and check if a threshold is crossed.
4381          * If none of thresholds below usage is crossed, we read
4382          * only one element of the array here.
4383          */
4384         for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4385                 eventfd_signal(t->entries[i].eventfd, 1);
4386
4387         /* i = current_threshold + 1 */
4388         i++;
4389
4390         /*
4391          * Iterate forward over array of thresholds starting from
4392          * current_threshold+1 and check if a threshold is crossed.
4393          * If none of thresholds above usage is crossed, we read
4394          * only one element of the array here.
4395          */
4396         for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4397                 eventfd_signal(t->entries[i].eventfd, 1);
4398
4399         /* Update current_threshold */
4400         t->current_threshold = i - 1;
4401 unlock:
4402         rcu_read_unlock();
4403 }
4404
4405 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4406 {
4407         while (memcg) {
4408                 __mem_cgroup_threshold(memcg, false);
4409                 if (do_swap_account)
4410                         __mem_cgroup_threshold(memcg, true);
4411
4412                 memcg = parent_mem_cgroup(memcg);
4413         }
4414 }
4415
4416 static int compare_thresholds(const void *a, const void *b)
4417 {
4418         const struct mem_cgroup_threshold *_a = a;
4419         const struct mem_cgroup_threshold *_b = b;
4420
4421         return _a->threshold - _b->threshold;
4422 }
4423
4424 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4425 {
4426         struct mem_cgroup_eventfd_list *ev;
4427
4428         list_for_each_entry(ev, &memcg->oom_notify, list)
4429                 eventfd_signal(ev->eventfd, 1);
4430         return 0;
4431 }
4432
4433 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4434 {
4435         struct mem_cgroup *iter;
4436
4437         for_each_mem_cgroup_tree(iter, memcg)
4438                 mem_cgroup_oom_notify_cb(iter);
4439 }
4440
4441 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4442         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4443 {
4444         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4445         struct mem_cgroup_thresholds *thresholds;
4446         struct mem_cgroup_threshold_ary *new;
4447         int type = MEMFILE_TYPE(cft->private);
4448         u64 threshold, usage;
4449         int i, size, ret;
4450
4451         ret = res_counter_memparse_write_strategy(args, &threshold);
4452         if (ret)
4453                 return ret;
4454
4455         mutex_lock(&memcg->thresholds_lock);
4456
4457         if (type == _MEM)
4458                 thresholds = &memcg->thresholds;
4459         else if (type == _MEMSWAP)
4460                 thresholds = &memcg->memsw_thresholds;
4461         else
4462                 BUG();
4463
4464         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4465
4466         /* Check if a threshold crossed before adding a new one */
4467         if (thresholds->primary)
4468                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4469
4470         size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4471
4472         /* Allocate memory for new array of thresholds */
4473         new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4474                         GFP_KERNEL);
4475         if (!new) {
4476                 ret = -ENOMEM;
4477                 goto unlock;
4478         }
4479         new->size = size;
4480
4481         /* Copy thresholds (if any) to new array */
4482         if (thresholds->primary) {
4483                 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4484                                 sizeof(struct mem_cgroup_threshold));
4485         }
4486
4487         /* Add new threshold */
4488         new->entries[size - 1].eventfd = eventfd;
4489         new->entries[size - 1].threshold = threshold;
4490
4491         /* Sort thresholds. Registering of new threshold isn't time-critical */
4492         sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4493                         compare_thresholds, NULL);
4494
4495         /* Find current threshold */
4496         new->current_threshold = -1;
4497         for (i = 0; i < size; i++) {
4498                 if (new->entries[i].threshold < usage) {
4499                         /*
4500                          * new->current_threshold will not be used until
4501                          * rcu_assign_pointer(), so it's safe to increment
4502                          * it here.
4503                          */
4504                         ++new->current_threshold;
4505                 }
4506         }
4507
4508         /* Free old spare buffer and save old primary buffer as spare */
4509         kfree(thresholds->spare);
4510         thresholds->spare = thresholds->primary;
4511
4512         rcu_assign_pointer(thresholds->primary, new);
4513
4514         /* To be sure that nobody uses thresholds */
4515         synchronize_rcu();
4516
4517 unlock:
4518         mutex_unlock(&memcg->thresholds_lock);
4519
4520         return ret;
4521 }
4522
4523 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4524         struct cftype *cft, struct eventfd_ctx *eventfd)
4525 {
4526         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4527         struct mem_cgroup_thresholds *thresholds;
4528         struct mem_cgroup_threshold_ary *new;
4529         int type = MEMFILE_TYPE(cft->private);
4530         u64 usage;
4531         int i, j, size;
4532
4533         mutex_lock(&memcg->thresholds_lock);
4534         if (type == _MEM)
4535                 thresholds = &memcg->thresholds;
4536         else if (type == _MEMSWAP)
4537                 thresholds = &memcg->memsw_thresholds;
4538         else
4539                 BUG();
4540
4541         /*
4542          * Something went wrong if we trying to unregister a threshold
4543          * if we don't have thresholds
4544          */
4545         BUG_ON(!thresholds);
4546
4547         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4548
4549         /* Check if a threshold crossed before removing */
4550         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4551
4552         /* Calculate new number of threshold */
4553         size = 0;
4554         for (i = 0; i < thresholds->primary->size; i++) {
4555                 if (thresholds->primary->entries[i].eventfd != eventfd)
4556                         size++;
4557         }
4558
4559         new = thresholds->spare;
4560
4561         /* Set thresholds array to NULL if we don't have thresholds */
4562         if (!size) {
4563                 kfree(new);
4564                 new = NULL;
4565                 goto swap_buffers;
4566         }
4567
4568         new->size = size;
4569
4570         /* Copy thresholds and find current threshold */
4571         new->current_threshold = -1;
4572         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4573                 if (thresholds->primary->entries[i].eventfd == eventfd)
4574                         continue;
4575
4576                 new->entries[j] = thresholds->primary->entries[i];
4577                 if (new->entries[j].threshold < usage) {
4578                         /*
4579                          * new->current_threshold will not be used
4580                          * until rcu_assign_pointer(), so it's safe to increment
4581                          * it here.
4582                          */
4583                         ++new->current_threshold;
4584                 }
4585                 j++;
4586         }
4587
4588 swap_buffers:
4589         /* Swap primary and spare array */
4590         thresholds->spare = thresholds->primary;
4591         rcu_assign_pointer(thresholds->primary, new);
4592
4593         /* To be sure that nobody uses thresholds */
4594         synchronize_rcu();
4595
4596         mutex_unlock(&memcg->thresholds_lock);
4597 }
4598
4599 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4600         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4601 {
4602         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4603         struct mem_cgroup_eventfd_list *event;
4604         int type = MEMFILE_TYPE(cft->private);
4605
4606         BUG_ON(type != _OOM_TYPE);
4607         event = kmalloc(sizeof(*event), GFP_KERNEL);
4608         if (!event)
4609                 return -ENOMEM;
4610
4611         spin_lock(&memcg_oom_lock);
4612
4613         event->eventfd = eventfd;
4614         list_add(&event->list, &memcg->oom_notify);
4615
4616         /* already in OOM ? */
4617         if (atomic_read(&memcg->under_oom))
4618                 eventfd_signal(eventfd, 1);
4619         spin_unlock(&memcg_oom_lock);
4620
4621         return 0;
4622 }
4623
4624 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4625         struct cftype *cft, struct eventfd_ctx *eventfd)
4626 {
4627         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4628         struct mem_cgroup_eventfd_list *ev, *tmp;
4629         int type = MEMFILE_TYPE(cft->private);
4630
4631         BUG_ON(type != _OOM_TYPE);
4632
4633         spin_lock(&memcg_oom_lock);
4634
4635         list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4636                 if (ev->eventfd == eventfd) {
4637                         list_del(&ev->list);
4638                         kfree(ev);
4639                 }
4640         }
4641
4642         spin_unlock(&memcg_oom_lock);
4643 }
4644
4645 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4646         struct cftype *cft,  struct cgroup_map_cb *cb)
4647 {
4648         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4649
4650         cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4651
4652         if (atomic_read(&memcg->under_oom))
4653                 cb->fill(cb, "under_oom", 1);
4654         else
4655                 cb->fill(cb, "under_oom", 0);
4656         return 0;
4657 }
4658
4659 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4660         struct cftype *cft, u64 val)
4661 {
4662         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4663         struct mem_cgroup *parent;
4664
4665         /* cannot set to root cgroup and only 0 and 1 are allowed */
4666         if (!cgrp->parent || !((val == 0) || (val == 1)))
4667                 return -EINVAL;
4668
4669         parent = mem_cgroup_from_cont(cgrp->parent);
4670
4671         cgroup_lock();
4672         /* oom-kill-disable is a flag for subhierarchy. */
4673         if ((parent->use_hierarchy) ||
4674             (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4675                 cgroup_unlock();
4676                 return -EINVAL;
4677         }
4678         memcg->oom_kill_disable = val;
4679         if (!val)
4680                 memcg_oom_recover(memcg);
4681         cgroup_unlock();
4682         return 0;
4683 }
4684
#ifdef CONFIG_NUMA
/* File operations installed on "numa_stat" once it has been opened. */
static const struct file_operations mem_control_numa_stat_file_operations = {
        .read = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};

/*
 * open handler for "numa_stat".
 *
 * Takes the cgroup from the d_fsdata of the file's parent dentry, switches
 * the file to the seq_file operations above and sets up a single-record
 * seq_file whose show routine receives that cgroup as private data.
 * Returns whatever single_open() returns.
 */
static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
{
        struct dentry *dir = file->f_dentry->d_parent;
        struct cgroup *cont = dir->d_fsdata;

        file->f_op = &mem_control_numa_stat_file_operations;

        return single_open(file, mem_control_numa_stat_show, cont);
}
#endif /* CONFIG_NUMA */
4700
4701 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
4702 static u64 kmem_limit_independent_read(struct cgroup *cgroup, struct cftype *cft)
4703 {
4704         return mem_cgroup_from_cont(cgroup)->kmem_independent_accounting;
4705 }
4706
4707 static int kmem_limit_independent_write(struct cgroup *cgroup, struct cftype *cft,
4708                                         u64 val)
4709 {
4710         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
4711         struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4712
4713         val = !!val;
4714
4715         /*
4716          * This follows the same hierarchy restrictions than
4717          * mem_cgroup_hierarchy_write()
4718          */
4719         if (!parent || !parent->use_hierarchy) {
4720                 if (list_empty(&cgroup->children))
4721                         memcg->kmem_independent_accounting = val;
4722                 else
4723                         return -EBUSY;
4724         }
4725         else
4726                 return -EINVAL;
4727
4728         return 0;
4729 }
4730 static struct cftype kmem_cgroup_files[] = {
4731         {
4732                 .name = "independent_kmem_limit",
4733                 .read_u64 = kmem_limit_independent_read,
4734                 .write_u64 = kmem_limit_independent_write,
4735         },
4736         {
4737                 .name = "kmem.usage_in_bytes",
4738                 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4739                 .read_u64 = mem_cgroup_read,
4740         },
4741         {
4742                 .name = "kmem.limit_in_bytes",
4743                 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4744                 .read_u64 = mem_cgroup_read,
4745         },
4746 };
4747
4748 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4749 {
4750         int ret = 0;
4751
4752         ret = cgroup_add_files(cont, ss, kmem_cgroup_files,
4753                                ARRAY_SIZE(kmem_cgroup_files));
4754
4755         /*
4756          * Part of this would be better living in a separate allocation
4757          * function, leaving us with just the cgroup tree population work.
4758          * We, however, depend on state such as network's proto_list that
4759          * is only initialized after cgroup creation. I found the less
4760          * cumbersome way to deal with it to defer it all to populate time
4761          */
4762         if (!ret)
4763                 ret = mem_cgroup_sockets_init(cont, ss);
4764         return ret;
4765 };
4766
/* Teardown counterpart of register_kmem_files(): calls mem_cgroup_sockets_destroy(). */
static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        mem_cgroup_sockets_destroy(cont, ss);
}
4772 #else
/* Stub used without CONFIG_CGROUP_MEM_RES_CTLR_KMEM: nothing to add, returns 0. */
static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
        return 0;
}
4777
/* Stub used without CONFIG_CGROUP_MEM_RES_CTLR_KMEM: nothing to tear down. */
static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
}
4782 #endif
4783
4784 static struct cftype mem_cgroup_files[] = {
4785         {
4786                 .name = "usage_in_bytes",
4787                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4788                 .read_u64 = mem_cgroup_read,
4789                 .register_event = mem_cgroup_usage_register_event,
4790                 .unregister_event = mem_cgroup_usage_unregister_event,
4791         },
4792         {
4793                 .name = "max_usage_in_bytes",
4794                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4795                 .trigger = mem_cgroup_reset,
4796                 .read_u64 = mem_cgroup_read,
4797         },
4798         {
4799                 .name = "limit_in_bytes",
4800                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4801                 .write_string = mem_cgroup_write,
4802                 .read_u64 = mem_cgroup_read,
4803         },
4804         {
4805                 .name = "soft_limit_in_bytes",
4806                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4807                 .write_string = mem_cgroup_write,
4808                 .read_u64 = mem_cgroup_read,
4809         },
4810         {
4811                 .name = "failcnt",
4812                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4813                 .trigger = mem_cgroup_reset,
4814                 .read_u64 = mem_cgroup_read,
4815         },
4816         {
4817                 .name = "stat",
4818                 .read_map = mem_control_stat_show,
4819         },
4820         {
4821                 .name = "force_empty",
4822                 .trigger = mem_cgroup_force_empty_write,
4823         },
4824         {
4825                 .name = "use_hierarchy",
4826                 .write_u64 = mem_cgroup_hierarchy_write,
4827                 .read_u64 = mem_cgroup_hierarchy_read,
4828         },
4829         {
4830                 .name = "swappiness",
4831                 .read_u64 = mem_cgroup_swappiness_read,
4832                 .write_u64 = mem_cgroup_swappiness_write,
4833         },
4834         {
4835                 .name = "move_charge_at_immigrate",
4836                 .read_u64 = mem_cgroup_move_charge_read,
4837                 .write_u64 = mem_cgroup_move_charge_write,
4838         },
4839         {
4840                 .name = "oom_control",
4841                 .read_map = mem_cgroup_oom_control_read,
4842                 .write_u64 = mem_cgroup_oom_control_write,
4843                 .register_event = mem_cgroup_oom_register_event,
4844                 .unregister_event = mem_cgroup_oom_unregister_event,
4845                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4846         },
4847 #ifdef CONFIG_NUMA
4848         {
4849                 .name = "numa_stat",
4850                 .open = mem_control_numa_stat_open,
4851                 .mode = S_IRUGO,
4852         },
4853 #endif
4854 };
4855
4856 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4857 static struct cftype memsw_cgroup_files[] = {
4858         {
4859                 .name = "memsw.usage_in_bytes",
4860                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4861                 .read_u64 = mem_cgroup_read,
4862                 .register_event = mem_cgroup_usage_register_event,
4863                 .unregister_event = mem_cgroup_usage_unregister_event,
4864         },
4865         {
4866                 .name = "memsw.max_usage_in_bytes",
4867                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4868                 .trigger = mem_cgroup_reset,
4869                 .read_u64 = mem_cgroup_read,
4870         },
4871         {
4872                 .name = "memsw.limit_in_bytes",
4873                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4874                 .write_string = mem_cgroup_write,
4875                 .read_u64 = mem_cgroup_read,
4876         },
4877         {
4878                 .name = "memsw.failcnt",
4879                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4880                 .trigger = mem_cgroup_reset,
4881                 .read_u64 = mem_cgroup_read,
4882         },
4883 };
4884
4885 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4886 {
4887         if (!do_swap_account)
4888                 return 0;
4889         return cgroup_add_files(cont, ss, memsw_cgroup_files,
4890                                 ARRAY_SIZE(memsw_cgroup_files));
4891 };
4892 #else
/* Stub used without CONFIG_CGROUP_MEM_RES_CTLR_SWAP: nothing to add, returns 0. */
static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
{
        return 0;
}
4897 #endif
4898
4899 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4900 {
4901         struct mem_cgroup_per_node *pn;
4902         struct mem_cgroup_per_zone *mz;
4903         enum lru_list l;
4904         int zone, tmp = node;
4905         /*
4906          * This routine is called against possible nodes.
4907          * But it's BUG to call kmalloc() against offline node.
4908          *
4909          * TODO: this routine can waste much memory for nodes which will
4910          *       never be onlined. It's better to use memory hotplug callback
4911          *       function.
4912          */
4913         if (!node_state(node, N_NORMAL_MEMORY))
4914                 tmp = -1;
4915         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4916         if (!pn)
4917                 return 1;
4918
4919         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4920                 mz = &pn->zoneinfo[zone];
4921                 for_each_lru(l)
4922                         INIT_LIST_HEAD(&mz->lruvec.lists[l]);
4923                 mz->usage_in_excess = 0;
4924                 mz->on_tree = false;
4925                 mz->mem = memcg;
4926         }
4927         memcg->info.nodeinfo[node] = pn;
4928         return 0;
4929 }
4930
4931 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4932 {
4933         kfree(memcg->info.nodeinfo[node]);
4934 }
4935
4936 static struct mem_cgroup *mem_cgroup_alloc(void)
4937 {
4938         struct mem_cgroup *mem;
4939         int size = sizeof(struct mem_cgroup);
4940
4941         /* Can be very big if MAX_NUMNODES is very big */
4942         if (size < PAGE_SIZE)
4943                 mem = kzalloc(size, GFP_KERNEL);
4944         else
4945                 mem = vzalloc(size);
4946
4947         if (!mem)
4948                 return NULL;
4949
4950         mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4951         if (!mem->stat)
4952                 goto out_free;
4953         spin_lock_init(&mem->pcp_counter_lock);
4954         return mem;
4955
4956 out_free:
4957         if (size < PAGE_SIZE)
4958                 kfree(mem);
4959         else
4960                 vfree(mem);
4961         return NULL;
4962 }
4963
/*
 * When a mem_cgroup is destroyed, references from swap_cgroup can remain.
 * (Scanning all of them at force_empty is too costly...)
 *
 * Instead of clearing all references at force_empty, we remember
 * the number of references from swap_cgroup and free the mem_cgroup when
 * that count goes down to 0.
 *
 * Removal of the cgroup itself succeeds regardless of refs from swap.
 */
4974
4975 static void __mem_cgroup_free(struct mem_cgroup *memcg)
4976 {
4977         int node;
4978
4979         mem_cgroup_remove_from_trees(memcg);
4980         free_css_id(&mem_cgroup_subsys, &memcg->css);
4981
4982         for_each_node_state(node, N_POSSIBLE)
4983                 free_mem_cgroup_per_zone_info(memcg, node);
4984
4985         free_percpu(memcg->stat);
4986         if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4987                 kfree(memcg);
4988         else
4989                 vfree(memcg);
4990 }
4991
4992 static void mem_cgroup_get(struct mem_cgroup *memcg)
4993 {
4994         atomic_inc(&memcg->refcnt);
4995 }
4996
4997 static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4998 {
4999         if (atomic_sub_and_test(count, &memcg->refcnt)) {
5000                 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5001                 __mem_cgroup_free(memcg);
5002                 if (parent)
5003                         mem_cgroup_put(parent);
5004         }
5005 }
5006
/* Drop a single reference on @memcg. */
static void mem_cgroup_put(struct mem_cgroup *memcg)
{
	__mem_cgroup_put(memcg, 1);
}
5011
5012 /*
5013  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
5014  */
5015 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
5016 {
5017         if (!memcg->res.parent)
5018                 return NULL;
5019         return mem_cgroup_from_res_counter(memcg->res.parent, res);
5020 }
5021 EXPORT_SYMBOL(parent_mem_cgroup);
5022
5023 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5024 static void __init enable_swap_cgroup(void)
5025 {
5026         if (!mem_cgroup_disabled() && really_do_swap_account)
5027                 do_swap_account = 1;
5028 }
5029 #else
5030 static void __init enable_swap_cgroup(void)
5031 {
5032 }
5033 #endif
5034
5035 static int mem_cgroup_soft_limit_tree_init(void)
5036 {
5037         struct mem_cgroup_tree_per_node *rtpn;
5038         struct mem_cgroup_tree_per_zone *rtpz;
5039         int tmp, node, zone;
5040
5041         for_each_node_state(node, N_POSSIBLE) {
5042                 tmp = node;
5043                 if (!node_state(node, N_NORMAL_MEMORY))
5044                         tmp = -1;
5045                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
5046                 if (!rtpn)
5047                         return 1;
5048
5049                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5050
5051                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5052                         rtpz = &rtpn->rb_tree_per_zone[zone];
5053                         rtpz->rb_root = RB_ROOT;
5054                         spin_lock_init(&rtpz->lock);
5055                 }
5056         }
5057         return 0;
5058 }
5059
5060 static struct cgroup_subsys_state * __ref
5061 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5062 {
5063         struct mem_cgroup *memcg, *parent;
5064         long error = -ENOMEM;
5065         int node;
5066
5067         memcg = mem_cgroup_alloc();
5068         if (!memcg)
5069                 return ERR_PTR(error);
5070
5071         for_each_node_state(node, N_POSSIBLE)
5072                 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5073                         goto free_out;
5074
5075         /* root ? */
5076         if (cont->parent == NULL) {
5077                 int cpu;
5078                 enable_swap_cgroup();
5079                 parent = NULL;
5080                 if (mem_cgroup_soft_limit_tree_init())
5081                         goto free_out;
5082                 root_mem_cgroup = memcg;
5083                 for_each_possible_cpu(cpu) {
5084                         struct memcg_stock_pcp *stock =
5085                                                 &per_cpu(memcg_stock, cpu);
5086                         INIT_WORK(&stock->work, drain_local_stock);
5087                 }
5088                 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5089         } else {
5090                 parent = mem_cgroup_from_cont(cont->parent);
5091                 memcg->use_hierarchy = parent->use_hierarchy;
5092                 memcg->oom_kill_disable = parent->oom_kill_disable;
5093         }
5094
5095         if (parent && parent->use_hierarchy) {
5096                 res_counter_init(&memcg->res, &parent->res);
5097                 res_counter_init(&memcg->memsw, &parent->memsw);
5098                 res_counter_init(&memcg->kmem, &parent->kmem);
5099                 /*
5100                  * We increment refcnt of the parent to ensure that we can
5101                  * safely access it on res_counter_charge/uncharge.
5102                  * This refcnt will be decremented when freeing this
5103                  * mem_cgroup(see mem_cgroup_put).
5104                  */
5105                 mem_cgroup_get(parent);
5106         } else {
5107                 res_counter_init(&memcg->res, NULL);
5108                 res_counter_init(&memcg->memsw, NULL);
5109                 res_counter_init(&memcg->kmem, NULL);
5110         }
5111         memcg->last_scanned_node = MAX_NUMNODES;
5112         INIT_LIST_HEAD(&memcg->oom_notify);
5113
5114         if (parent)
5115                 memcg->swappiness = mem_cgroup_swappiness(parent);
5116         atomic_set(&memcg->refcnt, 1);
5117         memcg->move_charge_at_immigrate = 0;
5118         mutex_init(&memcg->thresholds_lock);
5119         return &memcg->css;
5120 free_out:
5121         __mem_cgroup_free(memcg);
5122         return ERR_PTR(error);
5123 }
5124
5125 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
5126                                         struct cgroup *cont)
5127 {
5128         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5129
5130         return mem_cgroup_force_empty(memcg, false);
5131 }
5132
/*
 * destroy callback: run the kmem teardown for @cont, then drop one
 * reference on its mem_cgroup.
 */
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	struct mem_cgroup *victim = mem_cgroup_from_cont(cont);

	kmem_cgroup_destroy(ss, cont);
	mem_cgroup_put(victim);
}
5142
5143 static int mem_cgroup_populate(struct cgroup_subsys *ss,
5144                                 struct cgroup *cont)
5145 {
5146         int ret;
5147
5148         ret = cgroup_add_files(cont, ss, mem_cgroup_files,
5149                                 ARRAY_SIZE(mem_cgroup_files));
5150
5151         if (!ret)
5152                 ret = register_memsw_files(cont, ss);
5153
5154         if (!ret)
5155                 ret = register_kmem_files(cont, ss);
5156
5157         return ret;
5158 }
5159
5160 #ifdef CONFIG_MMU
5161 /* Handlers for move charge at task migration. */
5162 #define PRECHARGE_COUNT_AT_ONCE 256
5163 static int mem_cgroup_do_precharge(unsigned long count)
5164 {
5165         int ret = 0;
5166         int batch_count = PRECHARGE_COUNT_AT_ONCE;
5167         struct mem_cgroup *memcg = mc.to;
5168
5169         if (mem_cgroup_is_root(memcg)) {
5170                 mc.precharge += count;
5171                 /* we don't need css_get for root */
5172                 return ret;
5173         }
5174         /* try to charge at once */
5175         if (count > 1) {
5176                 struct res_counter *dummy;
5177                 /*
5178                  * "memcg" cannot be under rmdir() because we've already checked
5179                  * by cgroup_lock_live_cgroup() that it is not removed and we
5180                  * are still under the same cgroup_mutex. So we can postpone
5181                  * css_get().
5182                  */
5183                 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
5184                         goto one_by_one;
5185                 if (do_swap_account && res_counter_charge(&memcg->memsw,
5186                                                 PAGE_SIZE * count, &dummy)) {
5187                         res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
5188                         goto one_by_one;
5189                 }
5190                 mc.precharge += count;
5191                 return ret;
5192         }
5193 one_by_one:
5194         /* fall back to one by one charge */
5195         while (count--) {
5196                 if (signal_pending(current)) {
5197                         ret = -EINTR;
5198                         break;
5199                 }
5200                 if (!batch_count--) {
5201                         batch_count = PRECHARGE_COUNT_AT_ONCE;
5202                         cond_resched();
5203                 }
5204                 ret = __mem_cgroup_try_charge(NULL,
5205                                         GFP_KERNEL, 1, &memcg, false);
5206                 if (ret || !memcg)
5207                         /* mem_cgroup_clear_mc() will do uncharge later */
5208                         return -ENOMEM;
5209                 mc.precharge++;
5210         }
5211         return ret;
5212 }
5213
/**
 * is_target_pte_for_mc - check whether a pte is a valid target for move charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry is stored (can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in target->page
 *     with an extra reference taken (callers must drop it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is stored
 *     in target->ent.
 *
 * Called with pte lock held.
 */
5232 union mc_target {
5233         struct page     *page;
5234         swp_entry_t     ent;
5235 };
5236
/* Result codes of is_target_pte_for_mc(). */
enum mc_target_type {
	MC_TARGET_NONE = 0,	/* the pte is not a target for move charge */
	MC_TARGET_PAGE = 1,	/* a page is the target */
	MC_TARGET_SWAP = 2,	/* a swap entry is the target */
};
5242
5243 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5244                                                 unsigned long addr, pte_t ptent)
5245 {
5246         struct page *page = vm_normal_page(vma, addr, ptent);
5247
5248         if (!page || !page_mapped(page))
5249                 return NULL;
5250         if (PageAnon(page)) {
5251                 /* we don't move shared anon */
5252                 if (!move_anon() || page_mapcount(page) > 2)
5253                         return NULL;
5254         } else if (!move_file())
5255                 /* we ignore mapcount for file pages */
5256                 return NULL;
5257         if (!get_page_unless_zero(page))
5258                 return NULL;
5259
5260         return page;
5261 }
5262
5263 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5264                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
5265 {
5266         int usage_count;
5267         struct page *page = NULL;
5268         swp_entry_t ent = pte_to_swp_entry(ptent);
5269
5270         if (!move_anon() || non_swap_entry(ent))
5271                 return NULL;
5272         usage_count = mem_cgroup_count_swap_user(ent, &page);
5273         if (usage_count > 1) { /* we don't move shared anon */
5274                 if (page)
5275                         put_page(page);
5276                 return NULL;
5277         }
5278         if (do_swap_account)
5279                 entry->val = ent.val;
5280
5281         return page;
5282 }
5283
5284 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5285                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
5286 {
5287         struct page *page = NULL;
5288         struct inode *inode;
5289         struct address_space *mapping;
5290         pgoff_t pgoff;
5291
5292         if (!vma->vm_file) /* anonymous vma */
5293                 return NULL;
5294         if (!move_file())
5295                 return NULL;
5296
5297         inode = vma->vm_file->f_path.dentry->d_inode;
5298         mapping = vma->vm_file->f_mapping;
5299         if (pte_none(ptent))
5300                 pgoff = linear_page_index(vma, addr);
5301         else /* pte_file(ptent) is true */
5302                 pgoff = pte_to_pgoff(ptent);
5303
5304         /* page is moved even if it's not RSS of this task(page-faulted). */
5305         page = find_get_page(mapping, pgoff);
5306
5307 #ifdef CONFIG_SWAP
5308         /* shmem/tmpfs may report page out on swap: account for that too. */
5309         if (radix_tree_exceptional_entry(page)) {
5310                 swp_entry_t swap = radix_to_swp_entry(page);
5311                 if (do_swap_account)
5312                         *entry = swap;
5313                 page = find_get_page(&swapper_space, swap.val);
5314         }
5315 #endif
5316         return page;
5317 }
5318
5319 static int is_target_pte_for_mc(struct vm_area_struct *vma,
5320                 unsigned long addr, pte_t ptent, union mc_target *target)
5321 {
5322         struct page *page = NULL;
5323         struct page_cgroup *pc;
5324         int ret = 0;
5325         swp_entry_t ent = { .val = 0 };
5326
5327         if (pte_present(ptent))
5328                 page = mc_handle_present_pte(vma, addr, ptent);
5329         else if (is_swap_pte(ptent))
5330                 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5331         else if (pte_none(ptent) || pte_file(ptent))
5332                 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5333
5334         if (!page && !ent.val)
5335                 return 0;
5336         if (page) {
5337                 pc = lookup_page_cgroup(page);
5338                 /*
5339                  * Do only loose check w/o page_cgroup lock.
5340                  * mem_cgroup_move_account() checks the pc is valid or not under
5341                  * the lock.
5342                  */
5343                 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5344                         ret = MC_TARGET_PAGE;
5345                         if (target)
5346                                 target->page = page;
5347                 }
5348                 if (!ret || !target)
5349                         put_page(page);
5350         }
5351         /* There is a swap entry and a page doesn't exist or isn't charged */
5352         if (ent.val && !ret &&
5353                         css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
5354                 ret = MC_TARGET_SWAP;
5355                 if (target)
5356                         target->ent = ent;
5357         }
5358         return ret;
5359 }
5360
5361 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5362                                         unsigned long addr, unsigned long end,
5363                                         struct mm_walk *walk)
5364 {
5365         struct vm_area_struct *vma = walk->private;
5366         pte_t *pte;
5367         spinlock_t *ptl;
5368
5369         split_huge_page_pmd(walk->mm, pmd);
5370
5371         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5372         for (; addr != end; pte++, addr += PAGE_SIZE)
5373                 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
5374                         mc.precharge++; /* increment precharge temporarily */
5375         pte_unmap_unlock(pte - 1, ptl);
5376         cond_resched();
5377
5378         return 0;
5379 }
5380
5381 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5382 {
5383         unsigned long precharge;
5384         struct vm_area_struct *vma;
5385
5386         down_read(&mm->mmap_sem);
5387         for (vma = mm->mmap; vma; vma = vma->vm_next) {
5388                 struct mm_walk mem_cgroup_count_precharge_walk = {
5389                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
5390                         .mm = mm,
5391                         .private = vma,
5392                 };
5393                 if (is_vm_hugetlb_page(vma))
5394                         continue;
5395                 walk_page_range(vma->vm_start, vma->vm_end,
5396                                         &mem_cgroup_count_precharge_walk);
5397         }
5398         up_read(&mm->mmap_sem);
5399
5400         precharge = mc.precharge;
5401         mc.precharge = 0;
5402
5403         return precharge;
5404 }
5405
5406 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5407 {
5408         unsigned long precharge = mem_cgroup_count_precharge(mm);
5409
5410         VM_BUG_ON(mc.moving_task);
5411         mc.moving_task = current;
5412         return mem_cgroup_do_precharge(precharge);
5413 }
5414
5415 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5416 static void __mem_cgroup_clear_mc(void)
5417 {
5418         struct mem_cgroup *from = mc.from;
5419         struct mem_cgroup *to = mc.to;
5420
5421         /* we must uncharge all the leftover precharges from mc.to */
5422         if (mc.precharge) {
5423                 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
5424                 mc.precharge = 0;
5425         }
5426         /*
5427          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5428          * we must uncharge here.
5429          */
5430         if (mc.moved_charge) {
5431                 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
5432                 mc.moved_charge = 0;
5433         }
5434         /* we must fixup refcnts and charges */
5435         if (mc.moved_swap) {
5436                 /* uncharge swap account from the old cgroup */
5437                 if (!mem_cgroup_is_root(mc.from))
5438                         res_counter_uncharge(&mc.from->memsw,
5439                                                 PAGE_SIZE * mc.moved_swap);
5440                 __mem_cgroup_put(mc.from, mc.moved_swap);
5441
5442                 if (!mem_cgroup_is_root(mc.to)) {
5443                         /*
5444                          * we charged both to->res and to->memsw, so we should
5445                          * uncharge to->res.
5446                          */
5447                         res_counter_uncharge(&mc.to->res,
5448                                                 PAGE_SIZE * mc.moved_swap);
5449                 }
5450                 /* we've already done mem_cgroup_get(mc.to) */
5451                 mc.moved_swap = 0;
5452         }
5453         memcg_oom_recover(from);
5454         memcg_oom_recover(to);
5455         wake_up_all(&mc.waitq);
5456 }
5457
5458 static void mem_cgroup_clear_mc(void)
5459 {
5460         struct mem_cgroup *from = mc.from;
5461
5462         /*
5463          * we must clear moving_task before waking up waiters at the end of
5464          * task migration.
5465          */
5466         mc.moving_task = NULL;
5467         __mem_cgroup_clear_mc();
5468         spin_lock(&mc.lock);
5469         mc.from = NULL;
5470         mc.to = NULL;
5471         spin_unlock(&mc.lock);
5472         mem_cgroup_end_move(from);
5473 }
5474
5475 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5476                                 struct cgroup *cgroup,
5477                                 struct cgroup_taskset *tset)
5478 {
5479         struct task_struct *p = cgroup_taskset_first(tset);
5480         int ret = 0;
5481         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
5482
5483         if (memcg->move_charge_at_immigrate) {
5484                 struct mm_struct *mm;
5485                 struct mem_cgroup *from = mem_cgroup_from_task(p);
5486
5487                 VM_BUG_ON(from == memcg);
5488
5489                 mm = get_task_mm(p);
5490                 if (!mm)
5491                         return 0;
5492                 /* We move charges only when we move a owner of the mm */
5493                 if (mm->owner == p) {
5494                         VM_BUG_ON(mc.from);
5495                         VM_BUG_ON(mc.to);
5496                         VM_BUG_ON(mc.precharge);
5497                         VM_BUG_ON(mc.moved_charge);
5498                         VM_BUG_ON(mc.moved_swap);
5499                         mem_cgroup_start_move(from);
5500                         spin_lock(&mc.lock);
5501                         mc.from = from;
5502                         mc.to = memcg;
5503                         spin_unlock(&mc.lock);
5504                         /* We set mc.moving_task later */
5505
5506                         ret = mem_cgroup_precharge_mc(mm);
5507                         if (ret)
5508                                 mem_cgroup_clear_mc();
5509                 }
5510                 mmput(mm);
5511         }
5512         return ret;
5513 }
5514
/* cancel_attach callback: throw away the move-charge context. */
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct cgroup_taskset *tset)
{
	mem_cgroup_clear_mc();
}
5521
5522 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5523                                 unsigned long addr, unsigned long end,
5524                                 struct mm_walk *walk)
5525 {
5526         int ret = 0;
5527         struct vm_area_struct *vma = walk->private;
5528         pte_t *pte;
5529         spinlock_t *ptl;
5530
5531         split_huge_page_pmd(walk->mm, pmd);
5532 retry:
5533         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5534         for (; addr != end; addr += PAGE_SIZE) {
5535                 pte_t ptent = *(pte++);
5536                 union mc_target target;
5537                 int type;
5538                 struct page *page;
5539                 struct page_cgroup *pc;
5540                 swp_entry_t ent;
5541
5542                 if (!mc.precharge)
5543                         break;
5544
5545                 type = is_target_pte_for_mc(vma, addr, ptent, &target);
5546                 switch (type) {
5547                 case MC_TARGET_PAGE:
5548                         page = target.page;
5549                         if (isolate_lru_page(page))
5550                                 goto put;
5551                         pc = lookup_page_cgroup(page);
5552                         if (!mem_cgroup_move_account(page, 1, pc,
5553                                                      mc.from, mc.to, false)) {
5554                                 mc.precharge--;
5555                                 /* we uncharge from mc.from later. */
5556                                 mc.moved_charge++;
5557                         }
5558                         putback_lru_page(page);
5559 put:                    /* is_target_pte_for_mc() gets the page */
5560                         put_page(page);
5561                         break;
5562                 case MC_TARGET_SWAP:
5563                         ent = target.ent;
5564                         if (!mem_cgroup_move_swap_account(ent,
5565                                                 mc.from, mc.to, false)) {
5566                                 mc.precharge--;
5567                                 /* we fixup refcnts and charges later. */
5568                                 mc.moved_swap++;
5569                         }
5570                         break;
5571                 default:
5572                         break;
5573                 }
5574         }
5575         pte_unmap_unlock(pte - 1, ptl);
5576         cond_resched();
5577
5578         if (addr != end) {
5579                 /*
5580                  * We have consumed all precharges we got in can_attach().
5581                  * We try charge one by one, but don't do any additional
5582                  * charges to mc.to if we have failed in charge once in attach()
5583                  * phase.
5584                  */
5585                 ret = mem_cgroup_do_precharge(1);
5586                 if (!ret)
5587                         goto retry;
5588         }
5589
5590         return ret;
5591 }
5592
5593 static void mem_cgroup_move_charge(struct mm_struct *mm)
5594 {
5595         struct vm_area_struct *vma;
5596
5597         lru_add_drain_all();
5598 retry:
5599         if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5600                 /*
5601                  * Someone who are holding the mmap_sem might be waiting in
5602                  * waitq. So we cancel all extra charges, wake up all waiters,
5603                  * and retry. Because we cancel precharges, we might not be able
5604                  * to move enough charges, but moving charge is a best-effort
5605                  * feature anyway, so it wouldn't be a big problem.
5606                  */
5607                 __mem_cgroup_clear_mc();
5608                 cond_resched();
5609                 goto retry;
5610         }
5611         for (vma = mm->mmap; vma; vma = vma->vm_next) {
5612                 int ret;
5613                 struct mm_walk mem_cgroup_move_charge_walk = {
5614                         .pmd_entry = mem_cgroup_move_charge_pte_range,
5615                         .mm = mm,
5616                         .private = vma,
5617                 };
5618                 if (is_vm_hugetlb_page(vma))
5619                         continue;
5620                 ret = walk_page_range(vma->vm_start, vma->vm_end,
5621                                                 &mem_cgroup_move_charge_walk);
5622                 if (ret)
5623                         /*
5624                          * means we have consumed all precharges and failed in
5625                          * doing additional charge. Just abandon here.
5626                          */
5627                         break;
5628         }
5629         up_read(&mm->mmap_sem);
5630 }
5631
5632 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5633                                 struct cgroup *cont,
5634                                 struct cgroup_taskset *tset)
5635 {
5636         struct task_struct *p = cgroup_taskset_first(tset);
5637         struct mm_struct *mm = get_task_mm(p);
5638
5639         if (mm) {
5640                 if (mc.to)
5641                         mem_cgroup_move_charge(mm);
5642                 put_swap_token(mm);
5643                 mmput(mm);
5644         }
5645         if (mc.to)
5646                 mem_cgroup_clear_mc();
5647 }
5648 #else   /* !CONFIG_MMU */
/* !CONFIG_MMU: charges are never moved, so attaching is always allowed. */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct cgroup_taskset *tset)
{
	return 0;
}
/* !CONFIG_MMU: no move-charge context exists, nothing to cancel. */
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct cgroup_taskset *tset)
{
}
/* !CONFIG_MMU: no charges to move on attach. */
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup_taskset *tset)
{
}
5665 #endif
5666
5667 struct cgroup_subsys mem_cgroup_subsys = {
5668         .name = "memory",
5669         .subsys_id = mem_cgroup_subsys_id,
5670         .create = mem_cgroup_create,
5671         .pre_destroy = mem_cgroup_pre_destroy,
5672         .destroy = mem_cgroup_destroy,
5673         .populate = mem_cgroup_populate,
5674         .can_attach = mem_cgroup_can_attach,
5675         .cancel_attach = mem_cgroup_cancel_attach,
5676         .attach = mem_cgroup_move_task,
5677         .early_init = 0,
5678         .use_id = 1,
5679 };
5680
5681 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5682 static int __init enable_swap_account(char *s)
5683 {
5684         /* consider enabled if no parameter or 1 is given */
5685         if (!strcmp(s, "1"))
5686                 really_do_swap_account = 1;
5687         else if (!strcmp(s, "0"))
5688                 really_do_swap_account = 0;
5689         return 1;
5690 }
5691 __setup("swapaccount=", enable_swap_account);
5692
5693 #endif