Merge remote-tracking branch 'aio/master'

[karo-tx-linux.git] / fs / aio.c
diff --git a/fs/aio.c b/fs/aio.c

index 4918b896829217b56fc61fa0119bac461a0e1d57..5974090dc522b92261fe43552941ba9e2001a90e 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -112,6 +112,11 @@ struct kioctx {
  
         struct work_struct      free_work;
  
+       /*
+        * signals when all in-flight requests are done
+        */
+       struct completion *requests_done;
+
         struct {
                 /*
                  * This counts the number of available slots in the ringbuffer,
@@ -472,7 +477,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
  }
  EXPORT_SYMBOL(kiocb_set_cancel_fn);
  
-static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
+static int kiocb_cancel(struct kiocb *kiocb)
  {
         kiocb_cancel_fn *old, *cancel;
  
@@ -508,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
  {
         struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
  
+       /* At this point we know that there are no in-flight requests left */
+       if (ctx->requests_done)
+               complete(ctx->requests_done);
+
         INIT_WORK(&ctx->free_work, free_ioctx);
         schedule_work(&ctx->free_work);
  }
@@ -529,7 +538,7 @@ static void free_ioctx_users(struct percpu_ref *ref)
                                        struct kiocb, ki_list);
  
                 list_del_init(&req->ki_list);
-               kiocb_cancel(ctx, req);
+               kiocb_cancel(req);
         }
  
         spin_unlock_irq(&ctx->ctx_lock);
@@ -718,37 +727,42 @@ err:
   *     when the processes owning a context have all exited to encourage
   *     the rapid destruction of the kioctx.
   */
-static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
+static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+               struct completion *requests_done)
  {
-       if (!atomic_xchg(&ctx->dead, 1)) {
-               struct kioctx_table *table;
+       struct kioctx_table *table;
  
-               spin_lock(&mm->ioctx_lock);
-               rcu_read_lock();
-               table = rcu_dereference(mm->ioctx_table);
+       if (atomic_xchg(&ctx->dead, 1))
+               return -EINVAL;
  
-               WARN_ON(ctx != table->table[ctx->id]);
-               table->table[ctx->id] = NULL;
-               rcu_read_unlock();
-               spin_unlock(&mm->ioctx_lock);
  
-               /* percpu_ref_kill() will do the necessary call_rcu() */
-               wake_up_all(&ctx->wait);
+       spin_lock(&mm->ioctx_lock);
+       rcu_read_lock();
+       table = rcu_dereference(mm->ioctx_table);
  
-               /*
-                * It'd be more correct to do this in free_ioctx(), after all
-                * the outstanding kiocbs have finished - but by then io_destroy
-                * has already returned, so io_setup() could potentially return
-                * -EAGAIN with no ioctxs actually in use (as far as userspace
-                *  could tell).
-                */
-               aio_nr_sub(ctx->max_reqs);
+       WARN_ON(ctx != table->table[ctx->id]);
+       table->table[ctx->id] = NULL;
+       rcu_read_unlock();
+       spin_unlock(&mm->ioctx_lock);
  
-               if (ctx->mmap_size)
-                       vm_munmap(ctx->mmap_base, ctx->mmap_size);
+       /* percpu_ref_kill() will do the necessary call_rcu() */
+       wake_up_all(&ctx->wait);
  
-               percpu_ref_kill(&ctx->users);
-       }
+       /*
+        * It'd be more correct to do this in free_ioctx(), after all
+        * the outstanding kiocbs have finished - but by then io_destroy
+        * has already returned, so io_setup() could potentially return
+        * -EAGAIN with no ioctxs actually in use (as far as userspace
+        *  could tell).
+        */
+       aio_nr_sub(ctx->max_reqs);
+
+       if (ctx->mmap_size)
+               vm_munmap(ctx->mmap_base, ctx->mmap_size);
+
+       ctx->requests_done = requests_done;
+       percpu_ref_kill(&ctx->users);
+       return 0;
  }
  
  /* wait_on_sync_kiocb:
@@ -809,7 +823,7 @@ void exit_aio(struct mm_struct *mm)
                  */
                 ctx->mmap_size = 0;
  
-               kill_ioctx(mm, ctx);
+               kill_ioctx(mm, ctx, NULL);
         }
  }
  
@@ -1185,7 +1199,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
         if (!IS_ERR(ioctx)) {
                 ret = put_user(ioctx->user_id, ctxp);
                 if (ret)
-                       kill_ioctx(current->mm, ioctx);
+                       kill_ioctx(current->mm, ioctx, NULL);
                 percpu_ref_put(&ioctx->users);
         }
  
@@ -1203,9 +1217,25 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
  {
         struct kioctx *ioctx = lookup_ioctx(ctx);
         if (likely(NULL != ioctx)) {
-               kill_ioctx(current->mm, ioctx);
+               struct completion requests_done =
+                       COMPLETION_INITIALIZER_ONSTACK(requests_done);
+               int ret;
+
+               /* Pass requests_done to kill_ioctx() where it can be set
+                * in a thread-safe way. If we tried to set it here, we would
+                * have a race if two io_destroy() calls ran simultaneously.
+                */
+               ret = kill_ioctx(current->mm, ioctx, &requests_done);
                 percpu_ref_put(&ioctx->users);
-               return 0;
+
+               /* Wait until all IO for the context is done. Otherwise the
+                * kernel keeps using user-space buffers even though the user
+                * thinks the context is destroyed.
+                */
+               if (!ret)
+                       wait_for_completion(&requests_done);
+
+               return ret;
         }
         pr_debug("EINVAL: io_destroy: invalid context id\n");
         return -EINVAL;
@@ -1569,7 +1599,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
  
         kiocb = lookup_kiocb(ctx, iocb, key);
         if (kiocb)
-               ret = kiocb_cancel(ctx, kiocb);
+               ret = kiocb_cancel(kiocb);
         else
                 ret = -EINVAL;