]> git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - fs/dax.c
dax: Call ->iomap_begin without entry lock during dax fault
[karo-tx-linux.git] / fs / dax.c
index a8732fbed381a45bbce44fcdf0731ccfdc1a09ba..5c74f60d0a5094dc0a27f27ae0acd41667414332 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -451,16 +451,37 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
                __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
+static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+                                         pgoff_t index, bool trunc)
+{
+       int ret = 0;
+       void *entry;
+       struct radix_tree_root *page_tree = &mapping->page_tree;
+
+       spin_lock_irq(&mapping->tree_lock);
+       entry = get_unlocked_mapping_entry(mapping, index, NULL);
+       if (!entry || !radix_tree_exceptional_entry(entry))
+               goto out;
+       if (!trunc &&
+           (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+            radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+               goto out;
+       radix_tree_delete(page_tree, index);
+       mapping->nrexceptional--;
+       ret = 1;
+out:
+       put_unlocked_mapping_entry(mapping, index, entry);
+       spin_unlock_irq(&mapping->tree_lock);
+       return ret;
+}
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-       void *entry;
+       int ret = __dax_invalidate_mapping_entry(mapping, index, true);
 
-       spin_lock_irq(&mapping->tree_lock);
-       entry = get_unlocked_mapping_entry(mapping, index, NULL);
        /*
         * This gets called from truncate / punch_hole path. As such, the caller
         * must hold locks protecting against concurrent modifications of the
@@ -468,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
         * caller has seen exceptional entry for this index, we better find it
         * at that index as well...
         */
-       if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
-               spin_unlock_irq(&mapping->tree_lock);
-               return 0;
-       }
-       radix_tree_delete(&mapping->page_tree, index);
+       WARN_ON_ONCE(!ret);
+       return ret;
+}
+
+/*
+ * Invalidate exceptional DAX entry if easily possible. This handles DAX
+ * entries for invalidate_inode_pages() so we evict the entry only if we can
+ * do so without blocking.
+ */
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+       int ret = 0;
+       void *entry, **slot;
+       struct radix_tree_root *page_tree = &mapping->page_tree;
+
+       spin_lock_irq(&mapping->tree_lock);
+       entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
+       if (!entry || !radix_tree_exceptional_entry(entry) ||
+           slot_locked(mapping, slot))
+               goto out;
+       if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+           radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+               goto out;
+       radix_tree_delete(page_tree, index);
        mapping->nrexceptional--;
+       ret = 1;
+out:
        spin_unlock_irq(&mapping->tree_lock);
-       dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+       if (ret)
+               dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+       return ret;
+}
 
-       return 1;
+/*
+ * Invalidate exceptional DAX entry if it is clean.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+                                     pgoff_t index)
+{
+       return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
 /*
@@ -488,15 +539,16 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static int dax_load_hole(struct address_space *mapping, void **entry,
                         struct vm_fault *vmf)
 {
        struct page *page;
+       int ret;
 
        /* Hole page already exists? Return it...  */
-       if (!radix_tree_exceptional_entry(entry)) {
-               vmf->page = entry;
-               return VM_FAULT_LOCKED;
+       if (!radix_tree_exceptional_entry(*entry)) {
+               page = *entry;
+               goto out;
        }
 
        /* This will replace locked radix tree entry with a hole page */
@@ -504,8 +556,17 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
                                   vmf->gfp_mask | __GFP_ZERO);
        if (!page)
                return VM_FAULT_OOM;
+ out:
        vmf->page = page;
-       return VM_FAULT_LOCKED;
+       ret = finish_fault(vmf);
+       vmf->page = NULL;
+       *entry = page;
+       if (!ret) {
+               /* Grab reference for PTE that is now referencing the page */
+               get_page(page);
+               return VM_FAULT_NOPAGE;
+       }
+       return ret;
 }
 
 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
        if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
                return -EIO;
 
+       /*
+        * Write can allocate block for an area which has a hole page mapped
+        * into page tables. We have to tear down these mappings so that data
+        * written by write(2) is visible in mmap.
+        */
+       if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+               invalidate_inode_pages2_range(inode->i_mapping,
+                                             pos >> PAGE_SHIFT,
+                                             (end - 1) >> PAGE_SHIFT);
+       }
+
        while (pos < end) {
                unsigned offset = pos & (PAGE_SIZE - 1);
                struct blk_dax_ctl dax = { 0 };
@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
        if (iov_iter_rw(iter) == WRITE)
                flags |= IOMAP_WRITE;
 
-       /*
-        * Yes, even DAX files can have page cache attached to them:  A zeroed
-        * page is inserted into the pagecache when we have to serve a write
-        * fault on a hole.  It should never be dirtied and can simply be
-        * dropped from the pagecache once we get real data for the page.
-        *
-        * XXX: This is racy against mmap, and there's nothing we can do about
-        * it. We'll eventually need to shift this down even further so that
-        * we can check if we allocated blocks over a hole first.
-        */
-       if (mapping->nrpages) {
-               ret = invalidate_inode_pages2_range(mapping,
-                               pos >> PAGE_SHIFT,
-                               (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
-               WARN_ON_ONCE(ret);
-       }
-
        while (iov_iter_count(iter)) {
                ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
                                iter, dax_iomap_actor);
@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+       if (error == 0)
+               return VM_FAULT_NOPAGE;
+       if (error == -ENOMEM)
+               return VM_FAULT_OOM;
+       return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        if (pos >= i_size_read(inode))
                return VM_FAULT_SIGBUS;
 
-       entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-       if (IS_ERR(entry)) {
-               error = PTR_ERR(entry);
-               goto out;
-       }
-
        if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
                flags |= IOMAP_WRITE;
 
@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
         */
        error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
        if (error)
-               goto unlock_entry;
+               return dax_fault_return(error);
        if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-               error = -EIO;           /* fs corruption? */
+               vmf_ret = dax_fault_return(-EIO);       /* fs corruption? */
+               goto finish_iomap;
+       }
+
+       entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+       if (IS_ERR(entry)) {
+               vmf_ret = dax_fault_return(PTR_ERR(entry));
                goto finish_iomap;
        }
 
@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                }
 
                if (error)
-                       goto finish_iomap;
+                       goto error_unlock_entry;
 
                __SetPageUptodate(vmf->cow_page);
                vmf_ret = finish_fault(vmf);
                if (!vmf_ret)
                        vmf_ret = VM_FAULT_DONE_COW;
-               goto finish_iomap;
+               goto unlock_entry;
        }
 
        switch (iomap.type) {
@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                }
                error = dax_insert_mapping(mapping, iomap.bdev, sector,
                                PAGE_SIZE, &entry, vma, vmf);
+               /* -EBUSY is fine, somebody else faulted on the same PTE */
+               if (error == -EBUSY)
+                       error = 0;
                break;
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
                if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-                       vmf_ret = dax_load_hole(mapping, entry, vmf);
-                       break;
+                       vmf_ret = dax_load_hole(mapping, &entry, vmf);
+                       goto unlock_entry;
                }
                /*FALLTHRU*/
        default:
@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                break;
        }
 
+ error_unlock_entry:
+       vmf_ret = dax_fault_return(error) | major;
+ unlock_entry:
+       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  finish_iomap:
        if (ops->iomap_end) {
-               if (error || (vmf_ret & VM_FAULT_ERROR)) {
-                       /* keep previous error */
-                       ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-                                       &iomap);
-               } else {
-                       error = ops->iomap_end(inode, pos, PAGE_SIZE,
-                                       PAGE_SIZE, flags, &iomap);
-               }
-       }
- unlock_entry:
-       if (vmf_ret != VM_FAULT_LOCKED || error)
-               put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-       if (error == -ENOMEM)
-               return VM_FAULT_OOM | major;
-       /* -EBUSY is fine, somebody else faulted on the same PTE */
-       if (error < 0 && error != -EBUSY)
-               return VM_FAULT_SIGBUS | major;
-       if (vmf_ret) {
-               WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-               return vmf_ret;
+               int copied = PAGE_SIZE;
+
+               if (vmf_ret & VM_FAULT_ERROR)
+                       copied = 0;
+               /*
+                * The fault is done by now and there's no way back (other
+                * thread may be already happily using PTE we have installed).
+                * Just ignore error from ->iomap_end since we cannot do much
+                * with it.
+                */
+               ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
        }
-       return VM_FAULT_NOPAGE | major;
+       return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1276,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
                goto fallback;
 
-       /*
-        * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-        * PMD or a HZP entry.  If it can't (because a 4k page is already in
-        * the tree, for instance), it will return -EEXIST and we just fall
-        * back to 4k entries.
-        */
-       entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-       if (IS_ERR(entry))
-               goto fallback;
-
        /*
         * Note that we don't use iomap_apply here.  We aren't doing I/O, only
         * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        pos = (loff_t)pgoff << PAGE_SHIFT;
        error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
        if (error)
-               goto unlock_entry;
+               goto fallback;
+
        if (iomap.offset + iomap.length < pos + PMD_SIZE)
                goto finish_iomap;
 
+       /*
+        * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+        * PMD or a HZP entry.  If it can't (because a 4k page is already in
+        * the tree, for instance), it will return -EEXIST and we just fall
+        * back to 4k entries.
+        */
+       entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+       if (IS_ERR(entry))
+               goto finish_iomap;
+
        vmf.pgoff = pgoff;
        vmf.flags = flags;
        vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
                if (WARN_ON_ONCE(write))
-                       goto finish_iomap;
+                       goto unlock_entry;
                result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
                                &entry);
                break;
@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                break;
        }
 
+ unlock_entry:
+       put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
        if (ops->iomap_end) {
-               if (result == VM_FAULT_FALLBACK) {
-                       ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-                                       &iomap);
-               } else {
-                       error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-                                       iomap_flags, &iomap);
-                       if (error)
-                               result = VM_FAULT_FALLBACK;
-               }
+               int copied = PMD_SIZE;
+
+               if (result == VM_FAULT_FALLBACK)
+                       copied = 0;
+               /*
+                * The fault is done by now and there's no way back (other
+                * thread may be already happily using PMD we have installed).
+                * Just ignore error from ->iomap_end since we cannot do much
+                * with it.
+                */
+               ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+                               &iomap);
        }
- unlock_entry:
-       put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
        if (result == VM_FAULT_FALLBACK) {
                split_huge_pmd(vma, pmd, address);