userfaultfd: shmem: use shmem_mcopy_atomic_pte for shared memory

[karo-tx-linux.git] / mm / userfaultfd.c
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c

index af817e5060fbfbda2be8ba35024c1ad460055b20..a0817cc470b0067001b5bb08016faa3248251693 100644 (file)
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -14,6 +14,9 @@
  #include <linux/swapops.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/mmu_notifier.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/shmem_fs.h>
  #include <asm/tlbflush.h>
  #include "internal.h"
  
@@ -139,6 +142,198 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
         return pmd;
  }
  
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * __mcopy_atomic processing for HUGETLB vmas.  Copies len bytes of user
+ * memory starting at src_start into dst_vma starting at dst_start, one
+ * huge page (vma_kernel_pagesize(dst_vma) bytes) per loop iteration.
+ *
+ * Locking: this routine is called with mmap_sem held for read, and every
+ * return path releases mmap_sem before returning.
+ *
+ * dst_start and len must both be multiples of the huge page size and
+ * zeropage must be false; otherwise -EINVAL is returned.
+ *
+ * When hugetlb_mcopy_atomic_pte() returns -EFAULT, mmap_sem is dropped, the
+ * page it handed back is filled with copy_huge_page_from_user(), mmap_sem
+ * is taken again, and dst_vma is looked up and validated afresh before the
+ * same destination address is retried.
+ *
+ * Returns the number of bytes copied if at least one huge page was copied.
+ * Otherwise returns a negative errno: -EINVAL for a failed validation,
+ * -ENOMEM if anon_vma_prepare() or huge_pte_alloc() fails, -EEXIST if the
+ * destination pte is already populated, -EFAULT if the copy from user
+ * memory fails, or the error returned by hugetlb_mcopy_atomic_pte().
+ * A pending fatal signal stops the loop after the current page; because
+ * that page has already been counted, the byte count is returned rather
+ * than -EINTR.
+ */
+static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+                                             struct vm_area_struct *dst_vma,
+                                             unsigned long dst_start,
+                                             unsigned long src_start,
+                                             unsigned long len,
+                                             bool zeropage)
+{
+       ssize_t err;
+       pte_t *dst_pte;
+       unsigned long src_addr, dst_addr;
+       long copied;
+       struct page *page;
+       struct hstate *h;
+       unsigned long vma_hpagesize;
+       pgoff_t idx;
+       u32 hash;
+       struct address_space *mapping;
+
+       /*
+        * There is no default zero huge page for all huge page sizes
+        * supported by hugetlb.  A PMD_SIZE huge zero page may exist, as
+        * used by THP.  Since we cannot reliably insert a zero page, this
+        * feature is not supported.  mmap_sem is released before failing
+        * because this routine always returns with it dropped.
+        */
+       if (zeropage) {
+               up_read(&dst_mm->mmap_sem);
+               return -EINVAL;
+       }
+
+       src_addr = src_start;
+       dst_addr = dst_start;
+       copied = 0;
+       page = NULL;
+       vma_hpagesize = vma_kernel_pagesize(dst_vma);
+
+       /*
+        * Validate alignment based on huge page size: both dst_start and
+        * len must be multiples of vma_hpagesize.
+        */
+       err = -EINVAL;
+       if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
+               goto out_unlock;
+
+retry:
+       /*
+        * On routine entry dst_vma is set.  If we had to drop mmap_sem and
+        * retry, dst_vma will be set to NULL and we must look it up again
+        * and repeat the checks below, including that the huge page size
+        * still matches the one computed on entry.
+        */
+       if (!dst_vma) {
+               err = -EINVAL;
+               dst_vma = find_vma(dst_mm, dst_start);
+               if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
+                       goto out_unlock;
+
+               if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+                       goto out_unlock;
+
+               /*
+                * Make sure the vma is not shared, that the remaining dst
+                * range is both valid and fully within a single existing vma.
+                */
+               if (dst_vma->vm_flags & VM_SHARED)
+                       goto out_unlock;
+               if (dst_start < dst_vma->vm_start ||
+                   dst_start + len > dst_vma->vm_end)
+                       goto out_unlock;
+       }
+
+       if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
+                   (len - copied) & (vma_hpagesize - 1)))
+               goto out_unlock;
+
+       /*
+        * Only allow __mcopy_atomic_hugetlb on userfaultfd registered ranges.
+        * err is still -EINVAL here, so an unregistered vma fails with that.
+        */
+       if (!dst_vma->vm_userfaultfd_ctx.ctx)
+               goto out_unlock;
+
+       /*
+        * Ensure the dst_vma has an anon_vma.
+        */
+       err = -ENOMEM;
+       if (unlikely(anon_vma_prepare(dst_vma)))
+               goto out_unlock;
+
+       h = hstate_vma(dst_vma);
+
+       while (src_addr < src_start + len) {
+               pte_t dst_pteval;
+
+               BUG_ON(dst_addr >= dst_start + len);
+               VM_BUG_ON(dst_addr & ~huge_page_mask(h));
+
+               /*
+                * Serialize via hugetlb_fault_mutex.  The mutex is selected
+                * by hugetlb_fault_mutex_hash() and is held across the pte
+                * allocation, the "pte is still none" check and the call to
+                * hugetlb_mcopy_atomic_pte(); it is released on every exit
+                * from that sequence.
+                */
+               idx = linear_page_index(dst_vma, dst_addr);
+               mapping = dst_vma->vm_file->f_mapping;
+               hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
+                                                               idx, dst_addr);
+               mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+               err = -ENOMEM;
+               dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
+               if (!dst_pte) {
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       goto out_unlock;
+               }
+
+               err = -EEXIST;
+               dst_pteval = huge_ptep_get(dst_pte);
+               if (!huge_pte_none(dst_pteval)) {
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       goto out_unlock;
+               }
+
+               err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
+                                               dst_addr, src_addr, &page);
+
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+               cond_resched();
+
+               if (unlikely(err == -EFAULT)) {
+                       up_read(&dst_mm->mmap_sem);
+                       BUG_ON(!page);
+
+                       err = copy_huge_page_from_user(page,
+                                               (const void __user *)src_addr,
+                                               pages_per_huge_page(h), true);
+                       if (unlikely(err)) {
+                               err = -EFAULT;
+                               goto out;
+                       }
+                       down_read(&dst_mm->mmap_sem);
+
+                       dst_vma = NULL;
+                       goto retry;
+               } else
+                       BUG_ON(page);
+
+               if (!err) {
+                       dst_addr += vma_hpagesize;
+                       src_addr += vma_hpagesize;
+                       copied += vma_hpagesize;
+
+                       if (fatal_signal_pending(current))
+                               err = -EINTR;
+               }
+               if (err)
+                       break;
+       }
+
+out_unlock:
+       up_read(&dst_mm->mmap_sem);
+out:
+       if (page) {
+               /*
+                * We encountered an error and are about to free a newly
+                * allocated huge page.  It is possible that there was a
+                * reservation associated with the page that has been
+                * consumed.  See the routine restore_reserve_on_error
+                * for details.  Unfortunately, we cannot call
+                * restore_reserve_on_error now as it would require holding
+                * mmap_sem.  Clear the PagePrivate flag so that the global
+                * reserve count will not be incremented in free_huge_page.
+                * The reservation map will still indicate the reservation
+                * was consumed and possibly prevent later page allocation.
+                * This is better than leaking a global reservation.
+                */
+               ClearPagePrivate(page);
+               put_page(page);
+       }
+       BUG_ON(copied < 0);
+       BUG_ON(err > 0);
+       BUG_ON(!copied && !err);
+       return copied ? copied : err;
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+/*
+ * Declaration only: the !CONFIG_HUGETLB_PAGE branch supplies no definition
+ * of __mcopy_atomic_hugetlb, so the build fails if gcc attempts to use
+ * this (i.e. if a call to it is actually emitted).
+ */
+extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+                                     struct vm_area_struct *dst_vma,
+                                     unsigned long dst_start,
+                                     unsigned long src_start,
+                                     unsigned long len,
+                                     bool zeropage);
+#endif /* CONFIG_HUGETLB_PAGE */
+
  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                               unsigned long dst_start,
                                               unsigned long src_start,
@@ -175,12 +370,21 @@ retry:
          */
         err = -EINVAL;
         dst_vma = find_vma(dst_mm, dst_start);
-       if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+       if (!dst_vma)
+               goto out_unlock;
+       if (!vma_is_shmem(dst_vma) && dst_vma->vm_flags & VM_SHARED)
                 goto out_unlock;
         if (dst_start < dst_vma->vm_start ||
             dst_start + len > dst_vma->vm_end)
                 goto out_unlock;
  
+       /*
+        * If this is a HUGETLB vma, pass off to appropriate routine
+        */
+       if (is_vm_hugetlb_page(dst_vma))
+               return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+                                               src_start, len, zeropage);
+
         /*
          * Be strict and only allow __mcopy_atomic on userfaultfd
          * registered ranges to prevent userland errors going
@@ -193,11 +397,7 @@ retry:
         if (!dst_vma->vm_userfaultfd_ctx.ctx)
                 goto out_unlock;
  
-       /*
-        * FIXME: only allow copying on anonymous vmas, tmpfs should
-        * be added.
-        */
-       if (dst_vma->vm_ops)
+       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
                 goto out_unlock;
  
         /*
@@ -206,7 +406,7 @@ retry:
          * dst_vma.
          */
         err = -ENOMEM;
-       if (unlikely(anon_vma_prepare(dst_vma)))
+       if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
                 goto out_unlock;
  
         while (src_addr < src_start + len) {
@@ -243,12 +443,21 @@ retry:
                 BUG_ON(pmd_none(*dst_pmd));
                 BUG_ON(pmd_trans_huge(*dst_pmd));
  
-               if (!zeropage)
-                       err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
-                                              dst_addr, src_addr, &page);
-               else
-                       err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
-                                                dst_addr);
+               if (vma_is_anonymous(dst_vma)) {
+                       if (!zeropage)
+                               err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+                                                      dst_addr, src_addr,
+                                                      &page);
+                       else
+                               err = mfill_zeropage_pte(dst_mm, dst_pmd,
+                                                        dst_vma, dst_addr);
+               } else {
+                       err = -EINVAL; /* if zeropage is true return -EINVAL */
+                       if (likely(!zeropage))
+                               err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
+                                                            dst_vma, dst_addr,
+                                                            src_addr, &page);
+               }
  
                 cond_resched();