mm: close race between do_fault_around() and fault_around_bytes_set()

[karo-tx-linux.git] / mm / oom_kill.c
diff --git a/mm/oom_kill.c b/mm/oom_kill.c

index 1e4a600a6163645897a42defaf21f437fdf431a6..1e11df8fa7ecaecd274a3d0aaa1fe0aea4bb38ab 100644 (file)
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
  #ifdef CONFIG_NUMA
  /**
   * has_intersects_mems_allowed() - check task eligibility for kill
- * @tsk: task struct of which task to consider
+ * @start: task struct of which task to consider
   * @mask: nodemask passed to page allocator for mempolicy ooms
   *
   * Task eligibility is determined by whether or not a candidate task, @start,
   * shares the same mempolicy nodes as current if it is bound by such a policy
   * and whether or not it has the same set of allowed cpuset nodes.
   */
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
+static bool has_intersects_mems_allowed(struct task_struct *start,
                                         const nodemask_t *mask)
  {
-       struct task_struct *start = tsk;
+       struct task_struct *tsk;
+       bool ret = false;
  
-       do {
+       rcu_read_lock();
+       for_each_thread(start, tsk) {
                 if (mask) {
                         /*
                          * If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
                          * mempolicy intersects current, otherwise it may be
                          * needlessly killed.
                          */
-                       if (mempolicy_nodemask_intersects(tsk, mask))
-                               return true;
+                       ret = mempolicy_nodemask_intersects(tsk, mask);
                 } else {
                         /*
                          * This is not a mempolicy constrained oom, so only
                          * check the mems of tsk's cpuset.
                          */
-                       if (cpuset_mems_allowed_intersects(current, tsk))
-                               return true;
+                       ret = cpuset_mems_allowed_intersects(current, tsk);
                 }
-       } while_each_thread(start, tsk);
+               if (ret)
+                       break;
+       }
+       rcu_read_unlock();
  
-       return false;
+       return ret;
  }
  #else
  static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
   */
  struct task_struct *find_lock_task_mm(struct task_struct *p)
  {
-       struct task_struct *t = p;
+       struct task_struct *t;
  
-       do {
+       rcu_read_lock();
+
+       for_each_thread(p, t) {
                 task_lock(t);
                 if (likely(t->mm))
-                       return t;
+                       goto found;
                 task_unlock(t);
-       } while_each_thread(p, t);
+       }
+       t = NULL;
+found:
+       rcu_read_unlock();
  
-       return NULL;
+       return t;
  }
  
  /* return true if the task is not adequate as candidate victim task. */
@@ -170,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
          * implementation used by LSMs.
          */
         if (has_capability_noaudit(p, CAP_SYS_ADMIN))
-               adj -= 30;
+               points -= (points * 3) / 100;
  
         /* Normalize to oom_score_adj units */
         adj *= totalpages / 1000;
@@ -250,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
                 unsigned long totalpages, const nodemask_t *nodemask,
                 bool force_kill)
  {
-       if (task->exit_state)
-               return OOM_SCAN_CONTINUE;
         if (oom_unkillable_task(task, NULL, nodemask))
                 return OOM_SCAN_CONTINUE;
  
@@ -301,7 +307,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
         unsigned long chosen_points = 0;
  
         rcu_read_lock();
-       do_each_thread(g, p) {
+       for_each_process_thread(g, p) {
                 unsigned int points;
  
                 switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -319,11 +325,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
                         break;
                 };
                 points = oom_badness(p, NULL, nodemask, totalpages);
-               if (points > chosen_points) {
-                       chosen = p;
-                       chosen_points = points;
-               }
-       } while_each_thread(g, p);
+               if (!points || points < chosen_points)
+                       continue;
+               /* Prefer thread group leaders for display purposes */
+               if (points == chosen_points && thread_group_leader(chosen))
+                       continue;
+
+               chosen = p;
+               chosen_points = points;
+       }
         if (chosen)
                 get_task_struct(chosen);
         rcu_read_unlock();
@@ -406,7 +416,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
  {
         struct task_struct *victim = p;
         struct task_struct *child;
-       struct task_struct *t = p;
+       struct task_struct *t;
         struct mm_struct *mm;
         unsigned int victim_points = 0;
         static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +447,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
          * still freeing memory.
          */
         read_lock(&tasklist_lock);
-       do {
+       for_each_thread(p, t) {
                 list_for_each_entry(child, &t->children, sibling) {
                         unsigned int child_points;
  
@@ -455,13 +465,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                                 get_task_struct(victim);
                         }
                 }
-       } while_each_thread(p, t);
+       }
         read_unlock(&tasklist_lock);
  
-       rcu_read_lock();
         p = find_lock_task_mm(victim);
         if (!p) {
-               rcu_read_unlock();
                 put_task_struct(victim);
                 return;
         } else if (victim != p) {
@@ -487,6 +495,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
          * That thread will now get access to memory reserves since it has a
          * pending fatal signal.
          */
+       rcu_read_lock();
         for_each_process(p)
                 if (p->mm == mm && !same_thread_group(p, victim) &&
                     !(p->flags & PF_KTHREAD)) {
@@ -548,28 +557,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
   * if a parallel OOM killing is already taking place that includes a zone in
   * the zonelist.  Otherwise, locks all zones in the zonelist and returns true.
   */
-int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
  {
         struct zoneref *z;
         struct zone *zone;
-       int ret = 1;
+       bool ret = true;
  
         spin_lock(&zone_scan_lock);
-       for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+       for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
                 if (zone_is_oom_locked(zone)) {
-                       ret = 0;
+                       ret = false;
                         goto out;
                 }
-       }
  
-       for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-               /*
-                * Lock each zone in the zonelist under zone_scan_lock so a
-                * parallel invocation of try_set_zonelist_oom() doesn't succeed
-                * when it shouldn't.
-                */
+       /*
+        * Lock each zone in the zonelist under zone_scan_lock so a parallel
+        * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
+        */
+       for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
                 zone_set_flag(zone, ZONE_OOM_LOCKED);
-       }
  
  out:
         spin_unlock(&zone_scan_lock);
@@ -581,15 +587,14 @@ out:
   * allocation attempts with zonelists containing them may now recall the OOM
   * killer, if necessary.
   */
-void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
  {
         struct zoneref *z;
         struct zone *zone;
  
         spin_lock(&zone_scan_lock);
-       for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+       for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
                 zone_clear_flag(zone, ZONE_OOM_LOCKED);
-       }
         spin_unlock(&zone_scan_lock);
  }
  
@@ -683,9 +688,9 @@ void pagefault_out_of_memory(void)
         if (mem_cgroup_oom_synchronize(true))
                 return;
  
-       zonelist = node_zonelist(first_online_node, GFP_KERNEL);
-       if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
+       zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
+       if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
                 out_of_memory(NULL, 0, 0, NULL, false);
-               clear_zonelist_oom(zonelist, GFP_KERNEL);
+               oom_zonelist_unlock(zonelist, GFP_KERNEL);
         }
  }