]> git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - fs/nfs/pnfs.c
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
[karo-tx-linux.git] / fs / nfs / pnfs.c
index 2e00feacd4bee642c59c965612068653f120756c..fe624c91bd006ef11f839f61cab54f833286edfa 100644 (file)
@@ -35,6 +35,7 @@
 #include "iostat.h"
 
 #define NFSDBG_FACILITY                NFSDBG_PNFS
+#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
 
 /* Locking:
  *
@@ -190,7 +191,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
 
 /* Need to hold i_lock if caller does not already hold reference */
 void
-get_layout_hdr(struct pnfs_layout_hdr *lo)
+pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
 {
        atomic_inc(&lo->plh_refcount);
 }
@@ -199,43 +200,107 @@ static struct pnfs_layout_hdr *
 pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
 {
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
-       return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
-               kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+       return ld->alloc_layout_hdr(ino, gfp_flags);
 }
 
 static void
 pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-       struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+       struct nfs_server *server = NFS_SERVER(lo->plh_inode);
+       struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
+
+       if (!list_empty(&lo->plh_layouts)) {
+               struct nfs_client *clp = server->nfs_client;
+
+               spin_lock(&clp->cl_lock);
+               list_del_init(&lo->plh_layouts);
+               spin_unlock(&clp->cl_lock);
+       }
        put_rpccred(lo->plh_lc_cred);
-       return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
+       return ld->free_layout_hdr(lo);
 }
 
 static void
-destroy_layout_hdr(struct pnfs_layout_hdr *lo)
+pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
 {
+       struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
-       BUG_ON(!list_empty(&lo->plh_layouts));
-       NFS_I(lo->plh_inode)->layout = NULL;
-       pnfs_free_layout_hdr(lo);
+       nfsi->layout = NULL;
+       /* Reset MDS Threshold I/O counters */
+       nfsi->write_io = 0;
+       nfsi->read_io = 0;
+}
+
+void
+pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+       struct inode *inode = lo->plh_inode;
+
+       if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+               pnfs_detach_layout_hdr(lo);
+               spin_unlock(&inode->i_lock);
+               pnfs_free_layout_hdr(lo);
+       }
+}
+
+static int
+pnfs_iomode_to_fail_bit(u32 iomode)
+{
+       return iomode == IOMODE_RW ?
+               NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
 }
 
 static void
-put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
 {
-       if (atomic_dec_and_test(&lo->plh_refcount))
-               destroy_layout_hdr(lo);
+       lo->plh_retry_timestamp = jiffies;
+       if (test_and_set_bit(fail_bit, &lo->plh_flags))
+               atomic_inc(&lo->plh_refcount);
 }
 
-void
-put_layout_hdr(struct pnfs_layout_hdr *lo)
+static void
+pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
+{
+       if (test_and_clear_bit(fail_bit, &lo->plh_flags))
+               atomic_dec(&lo->plh_refcount);
+}
+
+static void
+pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
 {
        struct inode *inode = lo->plh_inode;
+       struct pnfs_layout_range range = {
+               .iomode = iomode,
+               .offset = 0,
+               .length = NFS4_MAX_UINT64,
+       };
+       LIST_HEAD(head);
 
-       if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
-               destroy_layout_hdr(lo);
-               spin_unlock(&inode->i_lock);
+       spin_lock(&inode->i_lock);
+       pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
+       pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
+       spin_unlock(&inode->i_lock);
+       pnfs_free_lseg_list(&head);
+       dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
+                       iomode == IOMODE_RW ?  "RW" : "READ");
+}
+
+static bool
+pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+       unsigned long start, end;
+       int fail_bit = pnfs_iomode_to_fail_bit(iomode);
+
+       if (test_bit(fail_bit, &lo->plh_flags) == 0)
+               return false;
+       end = jiffies;
+       start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
+       if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
+               /* It is time to retry the failed layoutgets */
+               pnfs_layout_clear_fail_bit(lo, fail_bit);
+               return false;
        }
+       return true;
 }
 
 static void
@@ -249,33 +314,32 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
        lseg->pls_layout = lo;
 }
 
-static void free_lseg(struct pnfs_layout_segment *lseg)
+static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
 {
        struct inode *ino = lseg->pls_layout->plh_inode;
 
        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
-       /* Matched by get_layout_hdr in pnfs_insert_layout */
-       put_layout_hdr(NFS_I(ino)->layout);
 }
 
 static void
-put_lseg_common(struct pnfs_layout_segment *lseg)
+pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
+               struct pnfs_layout_segment *lseg)
 {
-       struct inode *inode = lseg->pls_layout->plh_inode;
+       struct inode *inode = lo->plh_inode;
 
        WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
-       if (list_empty(&lseg->pls_layout->plh_segs)) {
-               set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
-               /* Matched by initial refcount set in alloc_init_layout_hdr */
-               put_layout_hdr_locked(lseg->pls_layout);
-       }
+       /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
+       atomic_dec(&lo->plh_refcount);
+       if (list_empty(&lo->plh_segs))
+               clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
        rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
 }
 
 void
-put_lseg(struct pnfs_layout_segment *lseg)
+pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
+       struct pnfs_layout_hdr *lo;
        struct inode *inode;
 
        if (!lseg)
@@ -284,17 +348,17 @@ put_lseg(struct pnfs_layout_segment *lseg)
        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
-       inode = lseg->pls_layout->plh_inode;
+       lo = lseg->pls_layout;
+       inode = lo->plh_inode;
        if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
-               LIST_HEAD(free_me);
-
-               put_lseg_common(lseg);
-               list_add(&lseg->pls_list, &free_me);
+               pnfs_get_layout_hdr(lo);
+               pnfs_layout_remove_lseg(lo, lseg);
                spin_unlock(&inode->i_lock);
-               pnfs_free_lseg_list(&free_me);
+               pnfs_free_lseg(lseg);
+               pnfs_put_layout_hdr(lo);
        }
 }
-EXPORT_SYMBOL_GPL(put_lseg);
+EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
 static inline u64
 end_offset(u64 start, u64 len)
@@ -378,7 +442,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
                dprintk("%s: lseg %p ref %d\n", __func__, lseg,
                        atomic_read(&lseg->pls_refcount));
                if (atomic_dec_and_test(&lseg->pls_refcount)) {
-                       put_lseg_common(lseg);
+                       pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
                        list_add(&lseg->pls_list, tmp_list);
                        rv = 1;
                }
@@ -390,7 +454,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
  * after call.
  */
 int
-mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
+pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
                            struct pnfs_layout_range *recall_range)
 {
@@ -399,14 +463,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 
        dprintk("%s:Begin lo %p\n", __func__, lo);
 
-       if (list_empty(&lo->plh_segs)) {
-               /* Reset MDS Threshold I/O counters */
-               NFS_I(lo->plh_inode)->write_io = 0;
-               NFS_I(lo->plh_inode)->read_io = 0;
-               if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
-                       put_layout_hdr_locked(lo);
+       if (list_empty(&lo->plh_segs))
                return 0;
-       }
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (!recall_range ||
                    should_free_lseg(&lseg->pls_range, recall_range)) {
@@ -426,25 +484,13 @@ void
 pnfs_free_lseg_list(struct list_head *free_me)
 {
        struct pnfs_layout_segment *lseg, *tmp;
-       struct pnfs_layout_hdr *lo;
 
        if (list_empty(free_me))
                return;
 
-       lo = list_first_entry(free_me, struct pnfs_layout_segment,
-                             pls_list)->pls_layout;
-
-       if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
-               struct nfs_client *clp;
-
-               clp = NFS_SERVER(lo->plh_inode)->nfs_client;
-               spin_lock(&clp->cl_lock);
-               list_del_init(&lo->plh_layouts);
-               spin_unlock(&clp->cl_lock);
-       }
        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
                list_del(&lseg->pls_list);
-               free_lseg(lseg);
+               pnfs_free_lseg(lseg);
        }
 }
 
@@ -458,10 +504,15 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
        lo = nfsi->layout;
        if (lo) {
                lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-               mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
-       }
-       spin_unlock(&nfsi->vfs_inode.i_lock);
-       pnfs_free_lseg_list(&tmp_list);
+               pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+               pnfs_get_layout_hdr(lo);
+               pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
+               pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
+               spin_unlock(&nfsi->vfs_inode.i_lock);
+               pnfs_free_lseg_list(&tmp_list);
+               pnfs_put_layout_hdr(lo);
+       } else
+               spin_unlock(&nfsi->vfs_inode.i_lock);
 }
 EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
 
@@ -498,46 +549,54 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
        }
 }
 
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+       return (s32)s1 - (s32)s2 > 0;
+}
+
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        bool update_barrier)
 {
-       u32 oldseq, newseq;
+       u32 oldseq, newseq, new_barrier;
+       int empty = list_empty(&lo->plh_segs);
 
        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);
-       if ((int)(newseq - oldseq) > 0) {
+       if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                if (update_barrier) {
-                       u32 new_barrier = be32_to_cpu(new->seqid);
-
-                       if ((int)(new_barrier - lo->plh_barrier))
-                               lo->plh_barrier = new_barrier;
+                       new_barrier = be32_to_cpu(new->seqid);
                } else {
                        /* Because of wraparound, we want to keep the barrier
-                        * "close" to the current seqids.  It needs to be
-                        * within 2**31 to count as "behind", so if it
-                        * gets too near that limit, give us a litle leeway
-                        * and bring it to within 2**30.
-                        * NOTE - and yes, this is all unsigned arithmetic.
+                        * "close" to the current seqids.
                         */
-                       if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
-                               lo->plh_barrier = newseq - (1 << 30);
+                       new_barrier = newseq - atomic_read(&lo->plh_outstanding);
                }
+               if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+                       lo->plh_barrier = new_barrier;
        }
 }
 
+static bool
+pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
+               const nfs4_stateid *stateid)
+{
+       u32 seqid = be32_to_cpu(stateid->seqid);
+
+       return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
+}
+
 /* lget is set to 1 if called from inside send_layoutget call chain */
 static bool
-pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
-                       int lget)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget)
 {
-       if ((stateid) &&
-           (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)
-               return true;
        return lo->plh_block_lgets ||
-               test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
                (list_empty(&lo->plh_segs) &&
                 (atomic_read(&lo->plh_outstanding) > lget));
@@ -551,7 +610,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 
        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
-       if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
+       if (pnfs_layoutgets_blocked(lo, 1)) {
                status = -EAGAIN;
        } else if (list_empty(&lo->plh_segs)) {
                int seq;
@@ -582,7 +641,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
-       struct pnfs_layout_segment *lseg = NULL;
+       struct pnfs_layout_segment *lseg;
 
        dprintk("--> %s\n", __func__);
 
@@ -599,16 +658,22 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
-       lgp->lsegpp = &lseg;
        lgp->gfp_flags = gfp_flags;
 
        /* Synchronously retrieve layout information from server and
         * store in lseg.
         */
-       nfs4_proc_layoutget(lgp, gfp_flags);
-       if (!lseg) {
-               /* remember that LAYOUTGET failed and suspend trying */
-               set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
+       lseg = nfs4_proc_layoutget(lgp, gfp_flags);
+       if (IS_ERR(lseg)) {
+               switch (PTR_ERR(lseg)) {
+               case -ENOMEM:
+               case -ERESTARTSYS:
+                       break;
+               default:
+                       /* remember that LAYOUTGET failed and suspend trying */
+                       pnfs_layout_io_set_failed(lo, range->iomode);
+               }
+               return NULL;
        }
 
        return lseg;
@@ -636,25 +701,24 @@ _pnfs_return_layout(struct inode *ino)
 
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
-       if (!lo || pnfs_test_layout_returned(lo)) {
+       if (!lo) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
        stateid = nfsi->layout->plh_stateid;
        /* Reference matched in nfs4_layoutreturn_release */
-       get_layout_hdr(lo);
+       pnfs_get_layout_hdr(lo);
        empty = list_empty(&lo->plh_segs);
-       mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+       pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
        /* Don't send a LAYOUTRETURN if list was initially empty */
        if (empty) {
                spin_unlock(&ino->i_lock);
-               put_layout_hdr(lo);
+               pnfs_put_layout_hdr(lo);
                dprintk("NFS: %s no layout segments to return\n", __func__);
                goto out;
        }
        lo->plh_block_lgets++;
-       pnfs_mark_layout_returned(lo);
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
 
@@ -663,10 +727,10 @@ _pnfs_return_layout(struct inode *ino)
        lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
-               set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags);
-               set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags);
-               pnfs_clear_layout_returned(lo);
-               put_layout_hdr(lo);
+               spin_lock(&ino->i_lock);
+               lo->plh_block_lgets--;
+               spin_unlock(&ino->i_lock);
+               pnfs_put_layout_hdr(lo);
                goto out;
        }
 
@@ -703,7 +767,7 @@ bool pnfs_roc(struct inode *ino)
        if (!found)
                goto out_nolayout;
        lo->plh_block_lgets++;
-       get_layout_hdr(lo); /* matched in pnfs_roc_release */
+       pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        return true;
@@ -720,8 +784,12 @@ void pnfs_roc_release(struct inode *ino)
        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        lo->plh_block_lgets--;
-       put_layout_hdr_locked(lo);
-       spin_unlock(&ino->i_lock);
+       if (atomic_dec_and_test(&lo->plh_refcount)) {
+               pnfs_detach_layout_hdr(lo);
+               spin_unlock(&ino->i_lock);
+               pnfs_free_layout_hdr(lo);
+       } else
+               spin_unlock(&ino->i_lock);
 }
 
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
@@ -730,32 +798,34 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 
        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
-       if ((int)(barrier - lo->plh_barrier) > 0)
+       if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
                lo->plh_barrier = barrier;
        spin_unlock(&ino->i_lock);
 }
 
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
+bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
 {
        struct nfs_inode *nfsi = NFS_I(ino);
+       struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg;
+       u32 current_seqid;
        bool found = false;
 
        spin_lock(&ino->i_lock);
        list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+                       rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
                        found = true;
-                       break;
+                       goto out;
                }
-       if (!found) {
-               struct pnfs_layout_hdr *lo = nfsi->layout;
-               u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
+       lo = nfsi->layout;
+       current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 
-               /* Since close does not return a layout stateid for use as
-                * a barrier, we choose the worst-case barrier.
-                */
-               *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
-       }
+       /* Since close does not return a layout stateid for use as
+        * a barrier, we choose the worst-case barrier.
+        */
+       *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
+out:
        spin_unlock(&ino->i_lock);
        return found;
 }
@@ -786,14 +856,13 @@ cmp_layout(struct pnfs_layout_range *l1,
 }
 
 static void
-pnfs_insert_layout(struct pnfs_layout_hdr *lo,
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg)
 {
        struct pnfs_layout_segment *lp;
 
        dprintk("%s:Begin\n", __func__);
 
-       assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lp, &lo->plh_segs, pls_list) {
                if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
                        continue;
@@ -813,7 +882,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
                __func__, lseg, lseg->pls_range.iomode,
                lseg->pls_range.offset, lseg->pls_range.length);
 out:
-       get_layout_hdr(lo);
+       pnfs_get_layout_hdr(lo);
 
        dprintk("%s:Return\n", __func__);
 }
@@ -847,21 +916,19 @@ pnfs_find_alloc_layout(struct inode *ino,
 
        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
 
-       assert_spin_locked(&ino->i_lock);
-       if (nfsi->layout) {
-               if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
-                       return NULL;
-               else
-                       return nfsi->layout;
-       }
+       if (nfsi->layout != NULL)
+               goto out_existing;
        spin_unlock(&ino->i_lock);
        new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
        spin_lock(&ino->i_lock);
 
-       if (likely(nfsi->layout == NULL))       /* Won the race? */
+       if (likely(nfsi->layout == NULL)) {     /* Won the race? */
                nfsi->layout = new;
-       else
-               pnfs_free_layout_hdr(new);
+               return new;
+       }
+       pnfs_free_layout_hdr(new);
+out_existing:
+       pnfs_get_layout_hdr(nfsi->layout);
        return nfsi->layout;
 }
 
@@ -904,11 +971,10 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 
        dprintk("%s:Begin\n", __func__);
 
-       assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
                    is_matching_lseg(&lseg->pls_range, range)) {
-                       ret = get_lseg(lseg);
+                       ret = pnfs_get_lseg(lseg);
                        break;
                }
                if (lseg->pls_range.offset > range->offset)
@@ -1013,7 +1079,6 @@ pnfs_update_layout(struct inode *ino,
                .length = count,
        };
        unsigned pg_offset;
-       struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs_client *clp = server->nfs_client;
        struct pnfs_layout_hdr *lo;
@@ -1021,16 +1086,16 @@ pnfs_update_layout(struct inode *ino,
        bool first = false;
 
        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
-               return NULL;
+               goto out;
 
        if (pnfs_within_mdsthreshold(ctx, ino, iomode))
-               return NULL;
+               goto out;
 
        spin_lock(&ino->i_lock);
        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
        if (lo == NULL) {
-               dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
-               goto out_unlock;
+               spin_unlock(&ino->i_lock);
+               goto out;
        }
 
        /* Do we even need to bother with this? */
@@ -1040,7 +1105,7 @@ pnfs_update_layout(struct inode *ino,
        }
 
        /* if LAYOUTGET already failed once we don't try again */
-       if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
+       if (pnfs_layout_io_test_failed(lo, iomode))
                goto out_unlock;
 
        /* Check to see if the layout for the given range already exists */
@@ -1048,17 +1113,13 @@ pnfs_update_layout(struct inode *ino,
        if (lseg)
                goto out_unlock;
 
-       if (pnfs_layoutgets_blocked(lo, NULL, 0))
+       if (pnfs_layoutgets_blocked(lo, 0))
                goto out_unlock;
        atomic_inc(&lo->plh_outstanding);
 
-       get_layout_hdr(lo);
        if (list_empty(&lo->plh_segs))
                first = true;
 
-       /* Enable LAYOUTRETURNs */
-       pnfs_clear_layout_returned(lo);
-
        spin_unlock(&ino->i_lock);
        if (first) {
                /* The lo must be on the clp list if there is any
@@ -1079,24 +1140,26 @@ pnfs_update_layout(struct inode *ino,
                arg.length = PAGE_CACHE_ALIGN(arg.length);
 
        lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
-       if (!lseg && first) {
-               spin_lock(&clp->cl_lock);
-               list_del_init(&lo->plh_layouts);
-               spin_unlock(&clp->cl_lock);
-       }
        atomic_dec(&lo->plh_outstanding);
-       put_layout_hdr(lo);
+out_put_layout_hdr:
+       pnfs_put_layout_hdr(lo);
 out:
-       dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-               nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
+       dprintk("%s: inode %s/%llu pNFS layout segment %s for "
+                       "(%s, offset: %llu, length: %llu)\n",
+                       __func__, ino->i_sb->s_id,
+                       (unsigned long long)NFS_FILEID(ino),
+                       lseg == NULL ? "not found" : "found",
+                       iomode==IOMODE_RW ?  "read/write" : "read-only",
+                       (unsigned long long)pos,
+                       (unsigned long long)count);
        return lseg;
 out_unlock:
        spin_unlock(&ino->i_lock);
-       goto out;
+       goto out_put_layout_hdr;
 }
 EXPORT_SYMBOL_GPL(pnfs_update_layout);
 
-int
+struct pnfs_layout_segment *
 pnfs_layout_process(struct nfs4_layoutget *lgp)
 {
        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
@@ -1123,25 +1186,29 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
                goto out_forget_reply;
        }
 
-       if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
+       if (pnfs_layoutgets_blocked(lo, 1) ||
+           pnfs_layout_stateid_blocked(lo, &res->stateid)) {
                dprintk("%s forget reply due to state\n", __func__);
                goto out_forget_reply;
        }
+
+       /* Done processing layoutget. Set the layout stateid */
+       pnfs_set_layout_stateid(lo, &res->stateid, false);
+
        init_lseg(lo, lseg);
        lseg->pls_range = res->range;
-       *lgp->lsegpp = get_lseg(lseg);
-       pnfs_insert_layout(lo, lseg);
+       pnfs_get_lseg(lseg);
+       pnfs_layout_insert_lseg(lo, lseg);
 
        if (res->return_on_close) {
                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
                set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
        }
 
-       /* Done processing layoutget. Set the layout stateid */
-       pnfs_set_layout_stateid(lo, &res->stateid, false);
        spin_unlock(&ino->i_lock);
+       return lseg;
 out:
-       return status;
+       return ERR_PTR(status);
 
 out_forget_reply:
        spin_unlock(&ino->i_lock);
@@ -1153,16 +1220,24 @@ out_forget_reply:
 void
 pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
+       u64 rd_size = req->wb_bytes;
+
        BUG_ON(pgio->pg_lseg != NULL);
 
        if (req->wb_offset != req->wb_pgbase) {
                nfs_pageio_reset_read_mds(pgio);
                return;
        }
+
+       if (pgio->pg_dreq == NULL)
+               rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
+       else
+               rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
        pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                           req->wb_context,
                                           req_offset(req),
-                                          req->wb_bytes,
+                                          rd_size,
                                           IOMODE_READ,
                                           GFP_KERNEL);
        /* If no lseg, fall back to read through mds */
@@ -1173,7 +1248,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
 
 void
-pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
+                          struct nfs_page *req, u64 wb_size)
 {
        BUG_ON(pgio->pg_lseg != NULL);
 
@@ -1181,10 +1257,11 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *
                nfs_pageio_reset_write_mds(pgio);
                return;
        }
+
        pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                           req->wb_context,
                                           req_offset(req),
-                                          req->wb_bytes,
+                                          wb_size,
                                           IOMODE_RW,
                                           GFP_NOFS);
        /* If no lseg, fall back to write through mds */
@@ -1362,12 +1439,12 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he
                if (trypnfs == PNFS_NOT_ATTEMPTED)
                        pnfs_write_through_mds(desc, data);
        }
-       put_lseg(lseg);
+       pnfs_put_lseg(lseg);
 }
 
 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-       put_lseg(hdr->lseg);
+       pnfs_put_lseg(hdr->lseg);
        nfs_writehdr_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
@@ -1382,17 +1459,17 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
        whdr = nfs_writehdr_alloc();
        if (!whdr) {
                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
-               put_lseg(desc->pg_lseg);
+               pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
                return -ENOMEM;
        }
        hdr = &whdr->header;
        nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
-       hdr->lseg = get_lseg(desc->pg_lseg);
+       hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
        atomic_inc(&hdr->refcnt);
        ret = nfs_generic_flush(desc, hdr);
        if (ret != 0) {
-               put_lseg(desc->pg_lseg);
+               pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
        } else
                pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
@@ -1517,12 +1594,12 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea
                if (trypnfs == PNFS_NOT_ATTEMPTED)
                        pnfs_read_through_mds(desc, data);
        }
-       put_lseg(lseg);
+       pnfs_put_lseg(lseg);
 }
 
 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 {
-       put_lseg(hdr->lseg);
+       pnfs_put_lseg(hdr->lseg);
        nfs_readhdr_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
@@ -1538,17 +1615,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
        if (!rhdr) {
                desc->pg_completion_ops->error_cleanup(&desc->pg_list);
                ret = -ENOMEM;
-               put_lseg(desc->pg_lseg);
+               pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
                return ret;
        }
        hdr = &rhdr->header;
        nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
-       hdr->lseg = get_lseg(desc->pg_lseg);
+       hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
        atomic_inc(&hdr->refcnt);
        ret = nfs_generic_pagein(desc, hdr);
        if (ret != 0) {
-               put_lseg(desc->pg_lseg);
+               pnfs_put_lseg(desc->pg_lseg);
                desc->pg_lseg = NULL;
        } else
                pnfs_do_multiple_reads(desc, &hdr->rpc_list);
@@ -1574,13 +1651,7 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 
 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 {
-       if (lseg->pls_range.iomode == IOMODE_RW) {
-               dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
-               set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
-       } else {
-               dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
-               set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
-       }
+       pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
 }
 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 
@@ -1601,7 +1672,7 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
        }
        if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
                /* references matched in nfs4_layoutcommit_release */
-               get_lseg(hdr->lseg);
+               pnfs_get_lseg(hdr->lseg);
        }
        if (end_pos > nfsi->layout->plh_lwb)
                nfsi->layout->plh_lwb = end_pos;