diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ea9008254df46f3acb935876468b0c0e9c4e8df5..18406158e13fbf5e9b4e7041489f2d20e404c075 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -27,6 +27,7 @@
 #include <linux/mempolicy.h>
 #include <linux/ioctl.h>
 #include <linux/security.h>
+#include <linux/hugetlb.h>
 
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
@@ -201,6 +202,49 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
        return msg;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Same functionality as userfaultfd_must_wait below with modifications for
+ * hugepmd ranges.
+ */
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+                                        unsigned long address,
+                                        unsigned long flags,
+                                        unsigned long reason)
+{
+       struct mm_struct *mm = ctx->mm;
+       pte_t *pte;
+       bool ret = true;
+
+       VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+       pte = huge_pte_offset(mm, address);
+       if (!pte)
+               goto out;
+
+       ret = false;
+
+       /*
+        * Lockless access: we're in a wait_event so it's ok if it
+        * changes under us.
+        */
+       if (huge_pte_none(*pte))
+               ret = true;
+       if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP))
+               ret = true;
+out:
+       return ret;
+}
+#else
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+                                        unsigned long address,
+                                        unsigned long flags,
+                                        unsigned long reason)
+{
+       return false;   /* should never get here */
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
 /*
  * Verify the pagetables are still not ok after having registered into
  * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
@@ -377,8 +421,12 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
        set_current_state(blocking_state);
        spin_unlock(&ctx->fault_pending_wqh.lock);
 
-       must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
-                                         reason);
+       if (!is_vm_hugetlb_page(vmf->vma))
+               must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
+                                                 reason);
+       else
+               must_wait = userfaultfd_huge_must_wait(ctx, vmf->address,
+                                                      vmf->flags, reason);
        up_read(&mm->mmap_sem);
 
        if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
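With this dispatch in place, a fault on a registered hugetlb VMA parks the faulting thread until userland resolves the missing page. A minimal userspace sketch of that round trip follows; it assumes an x86_64 box with 2 MB default huge pages already reserved (vm.nr_hugepages > 0) and the hugetlbfs UFFDIO_COPY support added elsewhere in this series. Error handling is trimmed.

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL << 20)	/* assumed default huge page size */

static int uffd;

static void *handler(void *arg)
{
	struct uffd_msg msg;
	/* Source buffer for the copy; ordinary pages are fine for src. */
	void *src = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
	    msg.event != UFFD_EVENT_PAGEFAULT)
		return NULL;

	struct uffdio_copy copy = {
		/* dst and len must be huge page aligned for hugetlb VMAs */
		.dst = msg.arg.pagefault.address & ~(HPAGE_SIZE - 1),
		.src = (unsigned long)src,
		.len = HPAGE_SIZE,
	};
	if (ioctl(uffd, UFFDIO_COPY, &copy))	/* wakes the faulter */
		perror("UFFDIO_COPY");
	return NULL;
}

int main(void)
{
	char *area = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = HPAGE_SIZE },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		return 1;

	pthread_t t;
	pthread_create(&t, NULL, handler, NULL);
	printf("%d\n", area[0]);	/* faults here, resumes after COPY */
	pthread_join(t, NULL);
	return 0;
}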
@@ -1048,6 +1096,12 @@ static __always_inline int validate_range(struct mm_struct *mm,
        return 0;
 }
 
+static inline bool vma_can_userfault(struct vm_area_struct *vma)
+{
+       return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
+               vma_is_shmem(vma);
+}
+
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                                unsigned long arg)
 {
@@ -1058,6 +1112,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        struct uffdio_register __user *user_uffdio_register;
        unsigned long vm_flags, new_flags;
        bool found;
+       bool non_anon_pages;
        unsigned long start, end, vma_end;
 
        user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1108,14 +1163,22 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        if (vma->vm_start >= end)
                goto out_unlock;
 
+       /*
+        * If the first vma contains huge pages, make sure start address
+        * is aligned to huge page size.
+        */
+       if (is_vm_hugetlb_page(vma)) {
+               unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+               if (start & (vma_hpagesize - 1))
+                       goto out_unlock;
+       }
+
        /*
         * Search for incompatible vmas.
-        *
-        * FIXME: this shall be relaxed later so that it doesn't fail
-        * on tmpfs backed vmas (in addition to the current allowance
-        * on anonymous vmas).
         */
        found = false;
+       non_anon_pages = false;
        for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
                cond_resched();
 
@@ -1124,8 +1187,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
                /* check not compatible vmas */
                ret = -EINVAL;
-               if (!vma_is_anonymous(cur))
+               if (!vma_can_userfault(cur))
                        goto out_unlock;
+               /*
+                * If this vma contains the ending address and is backed
+                * by huge pages, check that the ending address is huge
+                * page aligned.
+                */
+               if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
+                   end > cur->vm_start) {
+                       unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
+
+                       ret = -EINVAL;
+
+                       if (end & (vma_hpagesize - 1))
+                               goto out_unlock;
+               }
 
                /*
                 * Check that this vma isn't already owned by a
@@ -1138,6 +1214,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                    cur->vm_userfaultfd_ctx.ctx != ctx)
                        goto out_unlock;
 
+               /*
+                * Note vmas containing huge pages or shmem, i.e. pages
+                * that are not anonymous.
+                */
+               if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
+                       non_anon_pages = true;
+
                found = true;
        }
        BUG_ON(!found);
@@ -1149,7 +1231,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        do {
                cond_resched();
 
-               BUG_ON(!vma_is_anonymous(vma));
+               BUG_ON(!vma_can_userfault(vma));
                BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
                       vma->vm_userfaultfd_ctx.ctx != ctx);
 
@@ -1207,7 +1289,8 @@ out_unlock:
                 * userland which ioctls methods are guaranteed to
                 * succeed on this range.
                 */
-               if (put_user(UFFD_API_RANGE_IOCTLS,
+               if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+                            UFFD_API_RANGE_IOCTLS,
                             &user_uffdio_register->ioctls))
                        ret = -EFAULT;
        }
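Because UFFDIO_ZEROPAGE is not implemented for hugetlbfs or shmem at this point, any range that includes them reports the reduced UFFD_API_RANGE_IOCTLS_BASIC set. Userland should therefore test the returned bitmask instead of assuming the full set; roughly (register_and_check() is an illustrative wrapper):

#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int register_and_check(int uffd, struct uffdio_register *reg)
{
	if (ioctl(uffd, UFFDIO_REGISTER, reg))
		return -1;

	/* WAKE and COPY are in both sets; ZEROPAGE only in the full one. */
	if (!(reg->ioctls & (1ULL << _UFFDIO_COPY)))
		return -1;
	if (!(reg->ioctls & (1ULL << _UFFDIO_ZEROPAGE)))
		fprintf(stderr, "basic set: no UFFDIO_ZEROPAGE here\n");
	return 0;
}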
@@ -1253,12 +1336,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
        if (vma->vm_start >= end)
                goto out_unlock;
 
+       /*
+        * If the first vma contains huge pages, make sure start address
+        * is aligned to huge page size.
+        */
+       if (is_vm_hugetlb_page(vma)) {
+               unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+               if (start & (vma_hpagesize - 1))
+                       goto out_unlock;
+       }
+
        /*
         * Search for incompatible vmas.
-        *
-        * FIXME: this shall be relaxed later so that it doesn't fail
-        * on tmpfs backed vmas (in addition to the current allowance
-        * on anonymous vmas).
         */
        found = false;
        ret = -EINVAL;
@@ -1275,7 +1365,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                 * provides for more strict behavior to notice
                 * unregistration errors.
                 */
-               if (!vma_is_anonymous(cur))
+               if (!vma_can_userfault(cur))
                        goto out_unlock;
 
                found = true;
@@ -1289,7 +1379,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
        do {
                cond_resched();
 
-               BUG_ON(!vma_is_anonymous(vma));
+               BUG_ON(!vma_can_userfault(vma));
 
                /*
                 * Nothing to do: this vma is already registered into this
@@ -1302,6 +1392,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);
 
+               if (userfaultfd_missing(vma)) {
+                       /*
+                        * Wake any concurrent pending userfault while
+                        * we unregister, so they will not hang
+                        * permanently and userland does not have to
+                        * call UFFDIO_WAKE explicitly.
+                        */
+                       struct userfaultfd_wake_range range;
+                       range.start = start;
+                       range.len = vma_end - start;
+                       wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
+               }
+
                new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
                prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
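
The wake-up above means a monitor can simply unregister a range and any faulters still queued on it are released; no explicit UFFDIO_WAKE is needed first. For completeness, a minimal unregister (unregister_range() is an illustrative wrapper):

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

static int unregister_range(int uffd, unsigned long start, unsigned long len)
{
	struct uffdio_range range = { .start = start, .len = len };

	/* Pending MISSING faulters on this range are woken implicitly. */
	return ioctl(uffd, UFFDIO_UNREGISTER, &range);
}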