sparc64: Add 64K page size support

[karo-tx-linux.git] / fs / userfaultfd.c
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c

index 87d31921b66cdd3072a4f1beb74e05ce49ef3caf..18406158e13fbf5e9b4e7041489f2d20e404c075 100644 (file)
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -27,6 +27,7 @@
  #include <linux/mempolicy.h>
  #include <linux/ioctl.h>
  #include <linux/security.h>
+#include <linux/hugetlb.h>
  
  static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
  
@@ -64,6 +65,12 @@ struct userfaultfd_ctx {
         struct mm_struct *mm;
  };
  
+struct userfaultfd_fork_ctx {  /* one orig/new ctx pair tracked by dup_userfaultfd() */
+       struct userfaultfd_ctx *orig;   /* ctx found on the vma being duplicated */
+       struct userfaultfd_ctx *new;    /* ctx allocated by dup_userfaultfd() for orig */
+       struct list_head list;          /* link in the fcs list handed to dup_userfaultfd() */
+};
+
  struct userfaultfd_wait_queue {
         struct uffd_msg msg;
         wait_queue_t wq;
@@ -195,6 +202,49 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
         return msg;
  }
  
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Same functionality as userfaultfd_must_wait below with modifications for
+ * hugepmd ranges.
+ *
+ * Returns true when the faulting task still has to wait: there is no
+ * huge page table entry for @address, the entry is none, or the fault
+ * is a write-protect one (@reason has VM_UFFD_WP) and the entry is not
+ * writable.  Returns false otherwise.  @flags is currently unused.
+ *
+ * Must be called with ctx->mm->mmap_sem held.
+ */
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+                                        unsigned long address,
+                                        unsigned long flags,
+                                        unsigned long reason)
+{
+       struct mm_struct *mm = ctx->mm;
+       pte_t *ptep, pte;
+
+       VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+       ptep = huge_pte_offset(mm, address);
+       if (!ptep)
+               return true;
+
+       /*
+        * Lockless access: we're in a wait_event so it's ok if it
+        * changes under us.  Still, take a single snapshot through
+        * huge_ptep_get() so both checks below look at the same value
+        * and so the entry is read with the hugetlb accessor instead
+        * of a raw dereference.
+        */
+       pte = huge_ptep_get(ptep);
+
+       if (huge_pte_none(pte))
+               return true;
+       if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
+               return true;
+
+       return false;
+}
+#else
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+                                        unsigned long address,
+                                        unsigned long flags,
+                                        unsigned long reason)
+{
+       return false;   /* should never get here */
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
  /*
   * Verify the pagetables are still not ok after having registered into
   * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
@@ -371,8 +421,12 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
         set_current_state(blocking_state);
         spin_unlock(&ctx->fault_pending_wqh.lock);
  
-       must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
-                                         reason);
+       if (!is_vm_hugetlb_page(vmf->vma))
+               must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
+                                                 reason);
+       else
+               must_wait = userfaultfd_huge_must_wait(ctx, vmf->address,
+                                                      vmf->flags, reason);
         up_read(&mm->mmap_sem);
  
         if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
@@ -465,9 +519,8 @@ out:
         return ret;
  }
  
-static int __maybe_unused userfaultfd_event_wait_completion(
-               struct userfaultfd_ctx *ctx,
-               struct userfaultfd_wait_queue *ewq)
+static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+                                            struct userfaultfd_wait_queue *ewq)
  {
         int ret = 0;
  
@@ -518,6 +571,144 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
         __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
  }
  
+/*
+ * Set up the userfaultfd context of @vma while it is being duplicated.
+ *
+ * If the vma has no context, or its context did not enable
+ * UFFD_FEATURE_EVENT_FORK, the vma is detached from userfaultfd
+ * (context cleared, VM_UFFD_* flags dropped).  Otherwise the vma is
+ * pointed at the "new" context paired with its original one in @fcs;
+ * the pair (and the new context) is created on first use and appended
+ * to @fcs.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails.
+ */
+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+{
+       struct userfaultfd_ctx *parent = vma->vm_userfaultfd_ctx.ctx;
+       struct userfaultfd_ctx *child = NULL;
+       struct userfaultfd_fork_ctx *pair;
+
+       if (!parent || !(parent->features & UFFD_FEATURE_EVENT_FORK)) {
+               vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+               vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+               return 0;
+       }
+
+       /* Reuse the context already created for this original, if any. */
+       list_for_each_entry(pair, fcs, list) {
+               if (pair->orig != parent)
+                       continue;
+               child = pair->new;
+               break;
+       }
+       if (child)
+               goto attach;
+
+       pair = kmalloc(sizeof(*pair), GFP_KERNEL);
+       if (!pair)
+               return -ENOMEM;
+
+       child = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+       if (!child) {
+               kfree(pair);
+               return -ENOMEM;
+       }
+
+       /* New context: copies flags/features, starts running, pins the mm. */
+       atomic_set(&child->refcount, 1);
+       child->flags = parent->flags;
+       child->state = UFFD_STATE_RUNNING;
+       child->features = parent->features;
+       child->released = false;
+       child->mm = vma->vm_mm;
+       atomic_inc(&child->mm->mm_count);
+
+       /* The pair keeps a reference on the original context. */
+       userfaultfd_ctx_get(parent);
+       pair->orig = parent;
+       pair->new = child;
+       list_add_tail(&pair->list, fcs);
+
+attach:
+       vma->vm_userfaultfd_ctx.ctx = child;
+       return 0;
+}
+
+/*
+ * Build a UFFD_EVENT_FORK message for one orig/new pair and pass it to
+ * userfaultfd_event_wait_completion() on the original context.  The
+ * pointer to the new context is carried in the message's reserved1
+ * field.  Returns what userfaultfd_event_wait_completion() returns.
+ */
+static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
+{
+       struct userfaultfd_wait_queue fork_ewq;
+
+       msg_init(&fork_ewq.msg);
+
+       fork_ewq.msg.event = UFFD_EVENT_FORK;
+       fork_ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
+
+       return userfaultfd_event_wait_completion(fctx->orig, &fork_ewq);
+}
+
+/*
+ * Walk the list built by dup_userfaultfd(): deliver one fork event per
+ * entry via dup_fctx() and free every entry.  Once dup_fctx() has
+ * failed, no further events are delivered, but the remaining entries
+ * are still unlinked and freed.
+ */
+void dup_userfaultfd_complete(struct list_head *fcs)
+{
+       struct userfaultfd_fork_ctx *cur, *next;
+       int err = 0;
+
+       list_for_each_entry_safe(cur, next, fcs, list) {
+               if (err == 0)
+                       err = dup_fctx(cur);
+
+               list_del(&cur->list);
+               kfree(cur);
+       }
+}
+
+/*
+ * If @vma has a userfaultfd context with UFFD_FEATURE_EVENT_REMAP
+ * enabled, store it in @vm_ctx and take a reference on it.  Otherwise
+ * @vm_ctx is left untouched.
+ */
+void mremap_userfaultfd_prep(struct vm_area_struct *vma,
+                            struct vm_userfaultfd_ctx *vm_ctx)
+{
+       struct userfaultfd_ctx *uffd = vma->vm_userfaultfd_ctx.ctx;
+
+       if (!uffd)
+               return;
+       if (!(uffd->features & UFFD_FEATURE_EVENT_REMAP))
+               return;
+
+       vm_ctx->ctx = uffd;
+       userfaultfd_ctx_get(uffd);
+}
+
+/*
+ * Send a UFFD_EVENT_REMAP message (from, to, len) on the context saved
+ * in @vm_ctx, if there is one.
+ *
+ * When @to is not page aligned no event is sent and only the context
+ * reference is dropped.
+ * NOTE(review): an unaligned @to presumably encodes an mremap failure;
+ * confirm against the caller.
+ */
+void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
+                                unsigned long from, unsigned long to,
+                                unsigned long len)
+{
+       struct userfaultfd_wait_queue remap_ewq;
+       struct userfaultfd_ctx *uffd = vm_ctx->ctx;
+
+       if (!uffd)
+               return;
+
+       if ((to & ~PAGE_MASK) != 0) {
+               userfaultfd_ctx_put(uffd);
+               return;
+       }
+
+       msg_init(&remap_ewq.msg);
+
+       remap_ewq.msg.event = UFFD_EVENT_REMAP;
+       remap_ewq.msg.arg.remap.from = from;
+       remap_ewq.msg.arg.remap.to = to;
+       remap_ewq.msg.arg.remap.len = len;
+
+       userfaultfd_event_wait_completion(uffd, &remap_ewq);
+}
+
+/*
+ * Send a UFFD_EVENT_MADVDONTNEED message carrying @start and @end on
+ * the userfaultfd context of @vma.  Does nothing when the vma has no
+ * context or the context did not enable
+ * UFFD_FEATURE_EVENT_MADVDONTNEED.
+ *
+ * The read side of mmap_sem is released while the event is handed to
+ * userfaultfd_event_wait_completion() and taken again before
+ * returning; *prev is reset to NULL when that happens.
+ */
+void madvise_userfault_dontneed(struct vm_area_struct *vma,
+                               struct vm_area_struct **prev,
+                               unsigned long start, unsigned long end)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       struct userfaultfd_ctx *uffd = vma->vm_userfaultfd_ctx.ctx;
+       struct userfaultfd_wait_queue dontneed_ewq;
+
+       if (!(uffd && (uffd->features & UFFD_FEATURE_EVENT_MADVDONTNEED)))
+               return;
+
+       /* Pin the context before letting go of mmap_sem. */
+       userfaultfd_ctx_get(uffd);
+       up_read(&mm->mmap_sem);
+
+       /* We wait for ACK w/o the mmap semaphore */
+       *prev = NULL;
+
+       msg_init(&dontneed_ewq.msg);
+
+       dontneed_ewq.msg.event = UFFD_EVENT_MADVDONTNEED;
+       dontneed_ewq.msg.arg.madv_dn.start = start;
+       dontneed_ewq.msg.arg.madv_dn.end = end;
+
+       userfaultfd_event_wait_completion(uffd, &dontneed_ewq);
+
+       down_read(&mm->mmap_sem);
+}
+
  static int userfaultfd_release(struct inode *inode, struct file *file)
  {
         struct userfaultfd_ctx *ctx = file->private_data;
@@ -653,12 +844,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
         }
  }
  
+static const struct file_operations userfaultfd_fops;
+
+/*
+ * Create a file descriptor for the context @new and report it in @msg.
+ *
+ * A "[userfaultfd]" anon inode file backed by userfaultfd_fops is
+ * installed on a fresh fd; the fd flags come from new->flags masked
+ * with UFFD_SHARED_FCNTL_FLAGS.  On success the reserved1 field of
+ * @msg is cleared, the fd is stored in msg->arg.fork.ufd and 0 is
+ * returned.  On failure a negative error is returned and the reserved
+ * fd, if any, is released.  @ctx is not used here.
+ */
+static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
+                                 struct userfaultfd_ctx *new,
+                                 struct uffd_msg *msg)
+{
+       unsigned int fcntl_flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
+       struct file *uffd_file;
+       int new_fd;
+
+       new_fd = get_unused_fd_flags(fcntl_flags);
+       if (new_fd < 0)
+               return new_fd;
+
+       uffd_file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops,
+                                      new, O_RDWR | fcntl_flags);
+       if (IS_ERR(uffd_file)) {
+               put_unused_fd(new_fd);
+               return PTR_ERR(uffd_file);
+       }
+
+       fd_install(new_fd, uffd_file);
+
+       /* Clear reserved1 first, then store the fd in the same arg. */
+       msg->arg.reserved.reserved1 = 0;
+       msg->arg.fork.ufd = new_fd;
+
+       return 0;
+}
+
  static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                                     struct uffd_msg *msg)
  {
         ssize_t ret;
         DECLARE_WAITQUEUE(wait, current);
         struct userfaultfd_wait_queue *uwq;
+       /*
+        * Handling fork event requires sleeping operations, so
+        * we drop the event_wqh lock, then do these ops, then
+        * lock it back and wake up the waiter. While the lock is
+        * dropped the ewq may go away so we keep track of it
+        * carefully.
+        */
+       LIST_HEAD(fork_event);
+       struct userfaultfd_ctx *fork_nctx = NULL;
  
         /* always take the fd_wqh lock before the fault_pending_wqh lock */
         spin_lock(&ctx->fd_wqh.lock);
@@ -716,6 +944,16 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                 if (uwq) {
                         *msg = uwq->msg;
  
+                       if (uwq->msg.event == UFFD_EVENT_FORK) {
+                               fork_nctx = (struct userfaultfd_ctx *)
+                                       (unsigned long)
+                                       uwq->msg.arg.reserved.reserved1;
+                               list_move(&uwq->wq.task_list, &fork_event);
+                               spin_unlock(&ctx->event_wqh.lock);
+                               ret = 0;
+                               break;
+                       }
+
                         userfaultfd_event_complete(ctx, uwq);
                         spin_unlock(&ctx->event_wqh.lock);
                         ret = 0;
@@ -739,6 +977,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
         __set_current_state(TASK_RUNNING);
         spin_unlock(&ctx->fd_wqh.lock);
  
+       if (!ret && msg->event == UFFD_EVENT_FORK) {
+               ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+
+               if (!ret) {
+                       spin_lock(&ctx->event_wqh.lock);
+                       if (!list_empty(&fork_event)) {
+                               uwq = list_first_entry(&fork_event,
+                                                      typeof(*uwq),
+                                                      wq.task_list);
+                               list_del(&uwq->wq.task_list);
+                               __add_wait_queue(&ctx->event_wqh, &uwq->wq);
+                               userfaultfd_event_complete(ctx, uwq);
+                       }
+                       spin_unlock(&ctx->event_wqh.lock);
+               }
+       }
+
         return ret;
  }
  
@@ -841,6 +1096,12 @@ static __always_inline int validate_range(struct mm_struct *mm,
         return 0;
  }
  
+/*
+ * A vma is eligible for userfaultfd when it is anonymous, hugetlb
+ * backed or shmem backed.
+ */
+static inline bool vma_can_userfault(struct vm_area_struct *vma)
+{
+       if (vma_is_anonymous(vma))
+               return true;
+       if (is_vm_hugetlb_page(vma))
+               return true;
+
+       return vma_is_shmem(vma);
+}
+
  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                                 unsigned long arg)
  {
@@ -851,6 +1112,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
         struct uffdio_register __user *user_uffdio_register;
         unsigned long vm_flags, new_flags;
         bool found;
+       bool non_anon_pages;
         unsigned long start, end, vma_end;
  
         user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -901,14 +1163,22 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
         if (vma->vm_start >= end)
                 goto out_unlock;
  
+       /*
+        * If the first vma contains huge pages, make sure start address
+        * is aligned to huge page size.
+        */
+       if (is_vm_hugetlb_page(vma)) {
+               unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+               if (start & (vma_hpagesize - 1))
+                       goto out_unlock;
+       }
+
         /*
          * Search for not compatible vmas.
-        *
-        * FIXME: this shall be relaxed later so that it doesn't fail
-        * on tmpfs backed vmas (in addition to the current allowance
-        * on anonymous vmas).
          */
         found = false;
+       non_anon_pages = false;
         for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
                 cond_resched();
  
@@ -917,8 +1187,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
  
                 /* check not compatible vmas */
                 ret = -EINVAL;
-               if (!vma_is_anonymous(cur))
+               if (!vma_can_userfault(cur))
                         goto out_unlock;
+               /*
+                * If this is a hugetlb vma that contains the ending
+                * address, check that end is huge page aligned.
+                */
+               if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
+                   end > cur->vm_start) {
+                       unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
+
+                       ret = -EINVAL;
+
+                       if (end & (vma_hpagesize - 1))
+                               goto out_unlock;
+               }
  
                 /*
                  * Check that this vma isn't already owned by a
@@ -931,6 +1214,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                     cur->vm_userfaultfd_ctx.ctx != ctx)
                         goto out_unlock;
  
+               /*
+                * Note vmas that are hugetlb or shmem backed
+                */
+               if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
+                       non_anon_pages = true;
+
                 found = true;
         }
         BUG_ON(!found);
@@ -942,7 +1231,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
         do {
                 cond_resched();
  
-               BUG_ON(!vma_is_anonymous(vma));
+               BUG_ON(!vma_can_userfault(vma));
                 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
                        vma->vm_userfaultfd_ctx.ctx != ctx);
  
@@ -1000,7 +1289,8 @@ out_unlock:
                  * userland which ioctls methods are guaranteed to
                  * succeed on this range.
                  */
-               if (put_user(UFFD_API_RANGE_IOCTLS,
+               if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+                            UFFD_API_RANGE_IOCTLS,
                              &user_uffdio_register->ioctls))
                         ret = -EFAULT;
         }
@@ -1046,12 +1336,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
         if (vma->vm_start >= end)
                 goto out_unlock;
  
+       /*
+        * If the first vma contains huge pages, make sure start address
+        * is aligned to huge page size.
+        */
+       if (is_vm_hugetlb_page(vma)) {
+               unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+               if (start & (vma_hpagesize - 1))
+                       goto out_unlock;
+       }
+
         /*
          * Search for not compatible vmas.
-        *
-        * FIXME: this shall be relaxed later so that it doesn't fail
-        * on tmpfs backed vmas (in addition to the current allowance
-        * on anonymous vmas).
          */
         found = false;
         ret = -EINVAL;
@@ -1068,7 +1365,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                  * provides for more strict behavior to notice
                  * unregistration errors.
                  */
-               if (!vma_is_anonymous(cur))
+               if (!vma_can_userfault(cur))
                         goto out_unlock;
  
                 found = true;
@@ -1082,7 +1379,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
         do {
                 cond_resched();
  
-               BUG_ON(!vma_is_anonymous(vma));
+               BUG_ON(!vma_can_userfault(vma));
  
                 /*
                  * Nothing to do: this vma is already registered into this
@@ -1095,6 +1392,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                         start = vma->vm_start;
                 vma_end = min(end, vma->vm_end);
  
+               if (userfaultfd_missing(vma)) {
+                       /*
+                        * Wake any concurrent pending userfault while
+                        * we unregister, so they will not hang
+                        * permanently and it avoids userland to call
+                        * UFFDIO_WAKE explicitly.
+                        */
+                       struct userfaultfd_wake_range range;
+                       range.start = start;
+                       range.len = vma_end - start;
+                       wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
+               }
+
                 new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
                 prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                  vma->anon_vma, vma->vm_file, vma->vm_pgoff,