]> git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
ocfs2: record UNWRITTEN extents when populate write desc
authorRyan Ding <ryan.ding@oracle.com>
Wed, 21 Oct 2015 22:02:45 +0000 (09:02 +1100)
committerStephen Rothwell <sfr@canb.auug.org.au>
Wed, 21 Oct 2015 22:02:45 +0000 (09:02 +1100)
To support direct io in ocfs2_write_begin_nolock & ocfs2_write_end_nolock.

There is still one issue in the direct write procedure.

phase 1: alloc extent with UNWRITTEN flag
phase 2: submit direct data to disk, add zero page to page cache
phase 3: clear UNWRITTEN flag when data has been written to disk

When there are 2 direct write A(0~3KB),B(4~7KB) writing to the same
cluster 0~7KB (cluster size 8KB).  Write request A arrive phase 2 first,
it will zero the region (4~7KB).  Before request A enter to phase 3,
request B arrive phase 2, it will zero region (0~3KB).  This is just like
request B steps request A.

To resolve this issue, we should let request B knows this cluster is already
under zero, to prevent it from steps the previous write request.

This patch will add function ocfs2_unwritten_check() to do this job.  It
will record all clusters that are under direct write(it will be recorded
in the 'ip_unwritten_list' member of inode info), and prevent the later
direct write writing to the same cluster to do the zero work again.

Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/ocfs2/aops.c
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/super.c

index 1f2f47bbd99aae42d171327848c52c9420649529..a8dc0966ab98f829e2d0470f86aacc8e636b89fc 100644 (file)
@@ -1195,6 +1195,13 @@ next_bh:
 
 #define OCFS2_MAX_CLUSTERS_PER_PAGE    (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
 
+struct ocfs2_unwritten_extent {
+       struct list_head        ue_node;
+       struct list_head        ue_ip_node;
+       u32                     ue_cpos;
+       u32                     ue_phys;
+};
+
 /*
  * Describe the state of a single cluster to be written to.
  */
@@ -1269,6 +1276,8 @@ struct ocfs2_write_ctxt {
        struct buffer_head              *w_di_bh;
 
        struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+       struct list_head                w_unwritten_list;
 };
 
 void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1307,8 +1316,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
        ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
 }
 
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+                                struct list_head *head)
+{
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
+       struct ocfs2_unwritten_extent *dz = NULL, *tmp = NULL;
+
+       list_for_each_entry_safe(dz, tmp, head, ue_node) {
+               list_del(&dz->ue_node);
+               spin_lock(&oi->ip_lock);
+               list_del(&dz->ue_ip_node);
+               spin_unlock(&oi->ip_lock);
+               kfree(dz);
+       }
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+                                 struct ocfs2_write_ctxt *wc)
 {
+       ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
        ocfs2_unlock_pages(wc);
        brelse(wc->w_di_bh);
        kfree(wc);
@@ -1340,6 +1366,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
                wc->w_large_pages = 0;
 
        ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+       INIT_LIST_HEAD(&wc->w_unwritten_list);
 
        *wcp = wc;
 
@@ -1789,6 +1816,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
        }
 }
 
+/*
+ * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
+ * do the zero work. And should not to clear UNWRITTEN since it will be cleared
+ * by the direct io procedure.
+ * If this is a new extent that allocated by direct io, we should mark it in
+ * the ip_unwritten_list.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+                                struct ocfs2_write_ctxt *wc,
+                                struct ocfs2_write_cluster_desc *desc)
+{
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
+       struct ocfs2_unwritten_extent *dz = NULL, *new = NULL;
+       int ret = 0;
+
+       if (!desc->c_needs_zero)
+               return 0;
+
+retry:
+       spin_lock(&oi->ip_lock);
+       /* Needs not to zero no metter buffer or direct. The one who is zero
+        * the cluster is doing zero. And he will clear unwritten after all
+        * cluster io finished. */
+       list_for_each_entry(dz, &oi->ip_unwritten_list, ue_ip_node) {
+               if (desc->c_cpos == dz->ue_cpos) {
+                       BUG_ON(desc->c_new);
+                       desc->c_needs_zero = 0;
+                       desc->c_clear_unwritten = 0;
+                       goto unlock;
+               }
+       }
+
+       if (wc->w_type != OCFS2_WRITE_DIRECT)
+               goto unlock;
+
+       if (new == NULL) {
+               spin_unlock(&oi->ip_lock);
+               new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
+                            GFP_NOFS);
+               if (new == NULL) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               goto retry;
+       }
+       /* This direct write will doing zero. */
+       new->ue_cpos = desc->c_cpos;
+       new->ue_phys = desc->c_phys;
+       desc->c_clear_unwritten = 0;
+       list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+       list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+       new = NULL;
+unlock:
+       spin_unlock(&oi->ip_lock);
+out:
+       if (new)
+               kfree(new);
+       return ret;
+}
+
 /*
  * Populate each single-cluster write descriptor in the write context
  * with information about the i/o to be done.
@@ -1873,6 +1960,12 @@ static int ocfs2_populate_write_desc(struct inode *inode,
                        desc->c_needs_zero = 1;
                }
 
+               ret = ocfs2_unwritten_check(inode, wc, desc);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
                num_clusters--;
        }
 
@@ -2209,9 +2302,8 @@ try_again:
         * and non-sparse clusters we just extended.  For non-sparse writes,
         * we know zeros will only be needed in the first and/or last cluster.
         */
-       if (clusters_to_alloc || extents_to_split ||
-           (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
-                           wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+       if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+                          wc->w_desc[wc->w_clen - 1].c_needs_zero))
                cluster_of_pages = 1;
        else
                cluster_of_pages = 0;
@@ -2290,7 +2382,7 @@ out_commit:
        ocfs2_commit_trans(osb, handle);
 
 out:
-       ocfs2_free_write_ctxt(wc);
+       ocfs2_free_write_ctxt(inode, wc);
 
        if (data_ac) {
                ocfs2_free_alloc_context(data_ac);
@@ -2400,6 +2492,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
        handle_t *handle = wc->w_handle;
        struct page *tmppage;
 
+       BUG_ON(!list_empty(&wc->w_unwritten_list));
+
        if (handle) {
                ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
                                wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
index 8f87e05ee25d3824524c7f6e040a5f43d87c723d..0fd9ebdd3ed85cf8c754e7f110a29658eb7daa9f 100644 (file)
@@ -1125,6 +1125,9 @@ static void ocfs2_clear_inode(struct inode *inode)
        mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
                        "Clear inode of %llu, inode has io markers\n",
                        (unsigned long long)oi->ip_blkno);
+       mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+                       "Clear inode of %llu, inode has unwritten extents\n",
+                       (unsigned long long)oi->ip_blkno);
 
        ocfs2_extent_map_trunc(inode, 0);
 
index aac8b86f312e41009ed673f2725b276691c66961..0c22ddd4b0dd8de3f9492dff2b1dc7eed6406159 100644 (file)
@@ -57,6 +57,9 @@ struct ocfs2_inode_info
        u32                             ip_flags; /* see below */
        u32                             ip_attr; /* inode attributes */
 
+       /* Record unwritten extents during direct io. */
+       struct list_head                ip_unwritten_list;
+
        /* protected by recovery_lock. */
        struct inode                    *ip_next_orphan;
 
index 2de4c8a9340c267a16381faacbd0ce66c18d123f..0b28d583b95104149fcb07a9066a75680ebfc99f 100644 (file)
@@ -1744,6 +1744,7 @@ static void ocfs2_inode_init_once(void *data)
        spin_lock_init(&oi->ip_lock);
        ocfs2_extent_map_init(&oi->vfs_inode);
        INIT_LIST_HEAD(&oi->ip_io_markers);
+       INIT_LIST_HEAD(&oi->ip_unwritten_list);
        oi->ip_dir_start_lookup = 0;
        mutex_init(&oi->ip_unaligned_aio);
        init_rwsem(&oi->ip_alloc_sem);