Merge branch 'cow_readahead' of git://oss.oracle.com/git/tma/linux-2.6 into merge-2

author Joel Becker <joel.becker@oracle.com>
Fri, 10 Sep 2010 15:41:04 +0000 (08:41 -0700)
committer Joel Becker <joel.becker@oracle.com>
Fri, 10 Sep 2010 15:41:04 +0000 (08:41 -0700)
diff --combined fs/ocfs2/aops.c

index f477f18b35d5be2844fc74b602ffd33bdf6efb97,7155c5a919d722f05faadc117de698d8592fe610..5cfeee11815881b04a3250d42a72838e40289200
--- 1/fs/ocfs2/aops.c
--- 2/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@@ -883,8 -883,8 +883,8 @@@ struct ocfs2_write_ctxt 
          * out in so that future reads from that region will get
          * zero's.
          */
- -      struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
         unsigned int                    w_num_pages;
+ +      struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
         struct page                     *w_target_page;
   
         /*
@@@ -1642,7 -1642,8 +1642,8 @@@ static int ocfs2_zero_tail(struct inod
         return ret;
   }
   
- int ocfs2_write_begin_nolock(struct address_space *mapping,
+ int ocfs2_write_begin_nolock(struct file *filp,
+                            struct address_space *mapping,
                              loff_t pos, unsigned len, unsigned flags,
                              struct page **pagep, void **fsdata,
                              struct buffer_head *di_bh, struct page *mmap_page)
@@@ -1692,7 -1693,7 +1693,7 @@@
                 mlog_errno(ret);
                 goto out;
         } else if (ret == 1) {
-               ret = ocfs2_refcount_cow(inode, di_bh,
+               ret = ocfs2_refcount_cow(inode, filp, di_bh,
                                          wc->w_cpos, wc->w_clen, UINT_MAX);
                 if (ret) {
                         mlog_errno(ret);
@@@ -1854,7 -1855,7 +1855,7 @@@ static int ocfs2_write_begin(struct fil
          */
         down_write(&OCFS2_I(inode)->ip_alloc_sem);
   
-       ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+       ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
                                        fsdata, di_bh, NULL);
         if (ret) {
                 mlog_errno(ret);
diff --combined fs/ocfs2/file.c

index b03f6601fd71b523b39f333d26b16b99bb7076c9,4331f57e9fde58703f3bd260b8b002094996556f..9a74542e1a057904ef1108a5628f9b808f0847a1
--- 1/fs/ocfs2/file.c
--- 2/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@@ -36,7 -36,6 +36,7 @@@
   #include <linux/writeback.h>
   #include <linux/falloc.h>
   #include <linux/quotaops.h>
+ +#include <linux/blkdev.h>
   
   #define MLOG_MASK_PREFIX ML_INODE
   #include <cluster/masklog.h>
@@@ -64,6 -63,12 +64,6 @@@
   
   #include "buffer_head_io.h"
   
- -static int ocfs2_sync_inode(struct inode *inode)
- -{
- -      filemap_fdatawrite(inode->i_mapping);
- -      return sync_mapping_buffers(inode->i_mapping);
- -}
- -
   static int ocfs2_init_file_private(struct inode *inode, struct file *file)
   {
         struct ocfs2_file_private *fp;
@@@ -181,16 -186,12 +181,16 @@@ static int ocfs2_sync_file(struct file 
         mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
                    dentry->d_name.len, dentry->d_name.name);
   
- -      err = ocfs2_sync_inode(dentry->d_inode);
- -      if (err)
- -              goto bail;
- -
- -      if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ +      if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+ +              /*
+ +               * We still have to flush the drive's caches to get data to the
+ +               * platter.
+ +               */
+ +              if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+ +                      blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+ +                                         NULL, BLKDEV_IFL_WAIT);
                 goto bail;
+ +      }
   
         journal = osb->journal->j_journal;
         err = jbd2_journal_force_commit(journal);
@@@ -360,7 -361,7 +360,7 @@@ static int ocfs2_cow_file_pos(struct in
         if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
                 goto out;
   
-       return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+       return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
   
   out:
         return status;
@@@ -773,7 -774,7 +773,7 @@@ static int ocfs2_write_zero_page(struc
         BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
         BUG_ON(abs_from & (inode->i_blkbits - 1));
   
- -      page = grab_cache_page(mapping, index);
+ +      page = find_or_create_page(mapping, index, GFP_NOFS);
         if (!page) {
                 ret = -ENOMEM;
                 mlog_errno(ret);
@@@ -903,8 -904,8 +903,8 @@@ static int ocfs2_zero_extend_get_range(
                 zero_clusters = last_cpos - zero_cpos;
   
         if (needs_cow) {
-               rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
-                                       UINT_MAX);
+               rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+                                       zero_clusters, UINT_MAX);
                 if (rc) {
                         mlog_errno(rc);
                         goto out;
@@@ -2052,6 -2053,7 +2052,7 @@@ out
   }
   
   static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+                                           struct file *file,
                                             loff_t pos, size_t count,
                                             int *meta_level)
   {
@@@ -2069,7 -2071,7 +2070,7 @@@
   
         *meta_level = 1;
   
-       ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+       ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
         if (ret)
                 mlog_errno(ret);
   out:
@@@ -2077,7 -2079,7 +2078,7 @@@
         return ret;
   }
   
- static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+ static int ocfs2_prepare_inode_for_write(struct file *file,
                                          loff_t *ppos,
                                          size_t count,
                                          int appending,
@@@ -2085,6 -2087,7 +2086,7 @@@
                                          int *has_refcount)
   {
         int ret = 0, meta_level = 0;
+       struct dentry *dentry = file->f_path.dentry;
         struct inode *inode = dentry->d_inode;
         loff_t saved_pos, end;
   
@@@ -2140,6 -2143,7 +2142,7 @@@
                         meta_level = -1;
   
                         ret = ocfs2_prepare_inode_for_refcount(inode,
+                                                              file,
                                                                saved_pos,
                                                                count,
                                                                &meta_level);
@@@ -2254,7 -2258,7 +2257,7 @@@ relock
         }
   
         can_do_direct = direct_io;
-       ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+       ret = ocfs2_prepare_inode_for_write(file, ppos,
                                             iocb->ki_left, appending,
                                             &can_do_direct, &has_refcount);
         if (ret < 0) {
@@@ -2302,6 -2306,17 +2305,6 @@@
                 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
                                                     ppos, count, ocount);
                 if (written < 0) {
- -                      /*
- -                       * direct write may have instantiated a few
- -                       * blocks outside i_size. Trim these off again.
- -                       * Don't need i_size_read because we hold i_mutex.
- -                       *
- -                       * XXX(truncate): this looks buggy because ocfs2 did not
- -                       * actually implement ->truncate.  Take a look at
- -                       * the new truncate sequence and update this accordingly
- -                       */
- -                      if (*ppos + count > inode->i_size)
- -                              truncate_setsize(inode, inode->i_size);
                         ret = written;
                         goto out_dio;
                 }
@@@ -2317,7 -2332,7 +2320,7 @@@ out_dio
         BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
   
         if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
- -          ((file->f_flags & O_DIRECT) && has_refcount)) {
+ +          ((file->f_flags & O_DIRECT) && !direct_io)) {
                 ret = filemap_fdatawrite_range(file->f_mapping, pos,
                                                pos + count - 1);
                 if (ret < 0)
@@@ -2373,7 -2388,7 +2376,7 @@@ static int ocfs2_splice_to_file(struct 
   {
         int ret;
   
-       ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
+       ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
                                             sd->total_len, 0, NULL, NULL);
         if (ret < 0) {
                 mlog_errno(ret);
diff --combined fs/ocfs2/mmap.c

index 4c18f4ad93b43cae6e5ddcd5a2781fbc79292484,b04d6961c0d4d29dfd1394496fea37665746dcd4..7e32db9c2c993b38e67f5d1282525f0a089b54b0
--- 1/fs/ocfs2/mmap.c
--- 2/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@@ -59,10 -59,11 +59,11 @@@ static int ocfs2_fault(struct vm_area_s
         return ret;
   }
   
- static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
                                 struct page *page)
   {
         int ret;
+       struct inode *inode = file->f_path.dentry->d_inode;
         struct address_space *mapping = inode->i_mapping;
         loff_t pos = page_offset(page);
         unsigned int len = PAGE_CACHE_SIZE;
@@@ -74,11 -75,9 +75,11 @@@
         /*
          * Another node might have truncated while we were waiting on
          * cluster locks.
+ +       * We don't check size == 0 before the shift. This is borrowed
+ +       * from do_generic_file_read.
          */
- -      last_index = size >> PAGE_CACHE_SHIFT;
- -      if (page->index > last_index) {
+ +      last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ +      if (unlikely(!size || page->index > last_index)) {
                 ret = -EINVAL;
                 goto out;
         }
@@@ -109,9 -108,9 +110,9 @@@
          * because the "write" would invalidate their data.
          */
         if (page->index == last_index)
- -              len = size & ~PAGE_CACHE_MASK;
+ +              len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
   
-       ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+       ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
                                        &fsdata, di_bh, page);
         if (ret) {
                 if (ret != -ENOSPC)
@@@ -159,7 -158,7 +160,7 @@@ static int ocfs2_page_mkwrite(struct vm
          */
         down_write(&OCFS2_I(inode)->ip_alloc_sem);
   
-       ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+       ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
   
         up_write(&OCFS2_I(inode)->ip_alloc_sem);
   
diff --combined fs/ocfs2/refcounttree.c

index 0afeda83120fa0e54bd2c9cc7e1f4f08c6e836bf,47549f64224cee4836db2f4f9be81e13fedee5fa..a120cfcf69bfd842cf4d50f18ecef025389ff3be
--- 1/fs/ocfs2/refcounttree.c
--- 2/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@@ -49,6 -49,7 +49,7 @@@
   
   struct ocfs2_cow_context {
         struct inode *inode;
+       struct file *file;
         u32 cow_start;
         u32 cow_len;
         struct ocfs2_extent_tree data_et;
@@@ -2436,26 -2437,16 +2437,26 @@@ static int ocfs2_calc_refcount_meta_cre
                 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
                           le32_to_cpu(rec.r_clusters)) - cpos;
                 /*
- -               * If the refcount rec already exist, cool. We just need
- -               * to check whether there is a split. Otherwise we just need
- -               * to increase the refcount.
- -               * If we will insert one, increases recs_add.
- -               *
                  * We record all the records which will be inserted to the
                  * same refcount block, so that we can tell exactly whether
                  * we need a new refcount block or not.
+ +               *
+ +               * If we will insert a new one, this is easy: it only happens
+ +               * when adding the refcounted flag to the extent, so there is
+ +               * no chance of splitting. We just need one record.
+ +               *
+ +               * If the refcount rec already exists, things are a little more
+ +               * complicated. We may have to:
+ +               * 1) split at the beginning if the start pos isn't aligned.
+ +               *    We need 1 more record in this case.
+ +               * 2) split at the end if the end pos isn't aligned.
+ +               *    We need 1 more record in this case.
+ +               * 3) split in the middle because of file system fragmentation.
+ +               *    We need 2 more records in this case (we can't detect this
+ +               *    beforehand, so always assume the worst case).
                  */
                 if (rec.r_refcount) {
+ +                      recs_add += 2;
                         /* Check whether we need a split at the beginning. */
                         if (cpos == start_cpos &&
                             cpos != le64_to_cpu(rec.r_cpos))
@@@ -2932,13 -2923,16 +2933,16 @@@ static int ocfs2_duplicate_clusters_by_
         u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
         struct page *page;
         pgoff_t page_index;
-       unsigned int from, to;
+       unsigned int from, to, readahead_pages;
         loff_t offset, end, map_end;
         struct address_space *mapping = context->inode->i_mapping;
   
         mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
              new_cluster, new_len, cpos);
   
+       readahead_pages =
+               (ocfs2_cow_contig_clusters(sb) <<
+                OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
         offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
         end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
         /*
@@@ -2960,7 -2954,7 +2964,7 @@@
                 if (map_end & (PAGE_CACHE_SIZE - 1))
                         to = map_end & (PAGE_CACHE_SIZE - 1);
   
- -              page = grab_cache_page(mapping, page_index);
+ +              page = find_or_create_page(mapping, page_index, GFP_NOFS);
   
                 /*
                  * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@@ -2969,6 -2963,14 +2973,14 @@@
                 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
                         BUG_ON(PageDirty(page));
   
+               if (PageReadahead(page) && context->file) {
+                       page_cache_async_readahead(mapping,
+                                                  &context->file->f_ra,
+                                                  context->file,
+                                                  page, page_index,
+                                                  readahead_pages);
+               }
+ 
                 if (!PageUptodate(page)) {
                         ret = block_read_full_page(page, ocfs2_get_block);
                         if (ret) {
@@@ -3179,8 -3181,7 +3191,8 @@@ static int ocfs2_cow_sync_writeback(str
                 if (map_end > end)
                         map_end = end;
   
- -              page = grab_cache_page(context->inode->i_mapping, page_index);
+ +              page = find_or_create_page(context->inode->i_mapping,
+ +                                         page_index, GFP_NOFS);
                 BUG_ON(!page);
   
                 wait_on_page_writeback(page);
@@@ -3409,12 -3410,35 +3421,35 @@@ static int ocfs2_replace_cow(struct ocf
         return ret;
   }
   
+ static void ocfs2_readahead_for_cow(struct inode *inode,
+                                   struct file *file,
+                                   u32 start, u32 len)
+ {
+       struct address_space *mapping;
+       pgoff_t index;
+       unsigned long num_pages;
+       int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ 
+       if (!file)
+               return;
+ 
+       mapping = file->f_mapping;
+       num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
+       if (!num_pages)
+               num_pages = 1;
+ 
+       index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
+       page_cache_sync_readahead(mapping, &file->f_ra, file,
+                                 index, num_pages);
+ }
+ 
   /*
    * Starting at cpos, try to CoW write_len clusters.  Don't CoW
    * past max_cpos.  This will stop when it runs into a hole or an
    * unrefcounted extent.
    */
   static int ocfs2_refcount_cow_hunk(struct inode *inode,
+                                  struct file *file,
                                    struct buffer_head *di_bh,
                                    u32 cpos, u32 write_len, u32 max_cpos)
   {
@@@ -3443,6 -3467,8 +3478,8 @@@
   
         BUG_ON(cow_len == 0);
   
+       ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
+ 
         context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
         if (!context) {
                 ret = -ENOMEM;
@@@ -3464,6 -3490,7 +3501,7 @@@
         context->ref_root_bh = ref_root_bh;
         context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
         context->get_clusters = ocfs2_di_get_clusters;
+       context->file = file;
   
         ocfs2_init_dinode_extent_tree(&context->data_et,
                                       INODE_CACHE(inode), di_bh);
@@@ -3492,6 -3519,7 +3530,7 @@@ out
    * clusters between cpos and cpos+write_len are safe to modify.
    */
   int ocfs2_refcount_cow(struct inode *inode,
+                      struct file *file,
                        struct buffer_head *di_bh,
                        u32 cpos, u32 write_len, u32 max_cpos)
   {
@@@ -3511,7 -3539,7 +3550,7 @@@
                         num_clusters = write_len;
   
                 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
-                       ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+                       ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
                                                       num_clusters, max_cpos);
                         if (ret) {
                                 mlog_errno(ret);
diff --combined fs/ocfs2/refcounttree.h

index f04892d6175d8a8d8cd702e57526c01bd0896de6,29cba0eaa92743a98da2a095a5ff234edd98e7c6..c8ce46f7d8e30ee842cc8966a8c034aefae3b98b
--- 1/fs/ocfs2/refcounttree.h
--- 2/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@@ -21,14 -21,14 +21,14 @@@ struct ocfs2_refcount_tree 
         struct rb_node rf_node;
         u64 rf_blkno;
         u32 rf_generation;
+ +      struct kref rf_getcnt;
         struct rw_semaphore rf_sem;
         struct ocfs2_lock_res rf_lockres;
- -      struct kref rf_getcnt;
         int rf_removed;
   
         /* the following 4 fields are used by caching_info. */
- -      struct ocfs2_caching_info rf_ci;
         spinlock_t rf_lock;
+ +      struct ocfs2_caching_info rf_ci;
         struct mutex rf_io_mutex;
         struct super_block *rf_sb;
   };
@@@ -52,7 -52,8 +52,8 @@@ int ocfs2_prepare_refcount_change_for_d
                                           u32 clusters,
                                           int *credits,
                                           int *ref_blocks);
- int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+ int ocfs2_refcount_cow(struct inode *inode,
+                      struct file *filep, struct buffer_head *di_bh,
                        u32 cpos, u32 write_len, u32 max_cpos);
   
   typedef int (ocfs2_post_refcount_func)(struct inode *inode,
author	Joel Becker <joel.becker@oracle.com>
	Fri, 10 Sep 2010 15:41:04 +0000 (08:41 -0700)
committer	Joel Becker <joel.becker@oracle.com>
	Fri, 10 Sep 2010 15:41:04 +0000 (08:41 -0700)
		1	2
fs/ocfs2/aops.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ocfs2/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ocfs2/mmap.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ocfs2/refcounttree.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ocfs2/refcounttree.h	patch \|	diff1 \|	diff2 \|	blob \| history