]> git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge branch 'nfs-for-2.6.37' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6
authorLinus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Oct 2010 16:52:09 +0000 (09:52 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Oct 2010 16:52:09 +0000 (09:52 -0700)
* 'nfs-for-2.6.37' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6:
  net/sunrpc: Use static const char arrays
  nfs4: fix channel attribute sanity-checks
  NFSv4.1: Use more sensible names for 'initialize_mountpoint'
  NFSv4.1: pnfs: filelayout: add driver's LAYOUTGET and GETDEVICEINFO infrastructure
  NFSv4.1: pnfs: add LAYOUTGET and GETDEVICEINFO infrastructure
  NFS: client needs to maintain list of inodes with active layouts
  NFS: create and destroy inode's layout cache
  NFSv4.1: pnfs: filelayout: introduce minimal file layout driver
  NFSv4.1: pnfs: full mount/umount infrastructure
  NFS: set layout driver
  NFS: ask for layouttypes during v4 fsinfo call
  NFS: change stateid to be a union
  NFSv4.1: pnfsd, pnfs: protocol level pnfs constants
  SUNRPC: define xdr_decode_opaque_fixed
  NFSD: remove duplicate NFS4_STATEID_SIZE

24 files changed:
Documentation/filesystems/nfs/00-INDEX
Documentation/filesystems/nfs/pnfs.txt [new file with mode: 0644]
fs/nfs/Kconfig
fs/nfs/Makefile
fs/nfs/callback_proc.c
fs/nfs/client.c
fs/nfs/file.c
fs/nfs/inode.c
fs/nfs/nfs4filelayout.c [new file with mode: 0644]
fs/nfs/nfs4filelayout.h [new file with mode: 0644]
fs/nfs/nfs4filelayoutdev.c [new file with mode: 0644]
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4xdr.c
fs/nfs/pnfs.c [new file with mode: 0644]
fs/nfs/pnfs.h [new file with mode: 0644]
fs/nfs/read.c
fs/nfsd/nfs4callback.c
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
include/linux/sunrpc/xdr.h
net/sunrpc/auth_gss/gss_krb5_mech.c

index 3225a56621146cb68c328a6ea96c7ca40c5b7abd..a57e12411d2a2570511df65500989110121e476e 100644 (file)
@@ -12,6 +12,8 @@ nfs-rdma.txt
        - how to install and setup the Linux NFS/RDMA client and server software
 nfsroot.txt
        - short guide on setting up a diskless box with NFS root filesystem.
+pnfs.txt
+       - short explanation of some of the internals of the pnfs client code
 rpc-cache.txt
        - introduction to the caching mechanisms in the sunrpc layer.
 idmapper.txt
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
new file mode 100644 (file)
index 0000000..bc0b9cf
--- /dev/null
@@ -0,0 +1,48 @@
+Reference counting in pnfs:
+==========================
+
+There are several inter-related caches.  We have layouts which can
+reference multiple devices, each of which can reference multiple data servers.
+Each data server can be referenced by multiple devices.  Each device
+can be referenced by multiple layouts.  To keep all of this straight,
+we need to reference count.
+
+
+struct pnfs_layout_hdr
+----------------------
+The on-the-wire command LAYOUTGET corresponds to struct
+pnfs_layout_segment, usually referred to by the variable name lseg.
+Each nfs_inode may hold a pointer to a cache of these layout
+segments in nfsi->layout, of type struct pnfs_layout_hdr.
+
+We reference the header for the inode pointing to it, across each
+outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN,
+LAYOUTCOMMIT), and for each lseg held within.
+
+Each header is also (when non-empty) put on a list associated with
+struct nfs_client (cl_layouts).  Being put on this list does not bump
+the reference count, as the layout is kept around by the lseg that
+keeps it in the list.
+
+deviceid_cache
+--------------
+lsegs reference device ids, which are resolved per nfs_client and
+layout driver type.  The device ids are held in an RCU cache (struct
+nfs4_deviceid_cache).  The cache itself is referenced across each
+mount.  The entries (struct nfs4_deviceid) themselves are held across
+the lifetime of each lseg referencing them.
+
+RCU is used because the deviceid is basically a write once, read many
+data structure.  The hlist size of 32 buckets needs better
+justification, but seems reasonable given that we can have multiple
+deviceid's per filesystem, and multiple filesystems per nfs_client.
+
+The hash code is copied from the nfsd code base.  A discussion of
+hashing and variations of this algorithm can be found at:
+http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809
+
+data server cache
+-----------------
+file driver devices refer to data servers, which are kept in a module
+level cache.  Its reference is held over the lifetime of the deviceid
+pointing to it.
index 5c55c26af165195b6dd940940b7ee95c417c22b5..fd667652c5026daf78c878941da5f950dfdb4a5b 100644 (file)
@@ -77,13 +77,17 @@ config NFS_V4
 
 config NFS_V4_1
        bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
-       depends on NFS_V4 && EXPERIMENTAL
+       depends on NFS_FS && NFS_V4 && EXPERIMENTAL
+       select PNFS_FILE_LAYOUT
        help
          This option enables support for minor version 1 of the NFSv4 protocol
-         (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
+         (RFC 5661) in the kernel's NFS client.
 
          If unsure, say N.
 
+config PNFS_FILE_LAYOUT
+       tristate
+
 config ROOT_NFS
        bool "Root file system on NFS"
        depends on NFS_FS=y && IP_PNP
index da7fda639eac6445d3a34193610ca10f8f302e01..4776ff9e38143d5f3c01744d8cd7261c2a62553c 100644 (file)
@@ -15,5 +15,9 @@ nfs-$(CONFIG_NFS_V4)  += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
                           delegation.o idmap.o \
                           callback.o callback_xdr.o callback_proc.o \
                           nfs4namespace.o
+nfs-$(CONFIG_NFS_V4_1) += pnfs.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
+
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
index 930d10fecdaff8632dc35869c3695fc8397e8d7a..2950fca0c61b22cd0997d7c136cde40b90a0ae71 100644 (file)
@@ -118,11 +118,11 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n
        if (delegation == NULL)
                return 0;
 
-       /* seqid is 4-bytes long */
-       if (((u32 *) &stateid->data)[0] != 0)
+       if (stateid->stateid.seqid != 0)
                return 0;
-       if (memcmp(&delegation->stateid.data[4], &stateid->data[4],
-                  sizeof(stateid->data)-4))
+       if (memcmp(&delegation->stateid.stateid.other,
+                  &stateid->stateid.other,
+                  NFS4_STATEID_OTHER_SIZE))
                return 0;
 
        return 1;
index a882785eba41efdfc05e45773bdc23bb525a48be..fd6f0a70021b9fa8c1918372f2cd62d22636dfb9 100644 (file)
@@ -48,6 +48,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY                NFSDBG_CLIENT
 
@@ -155,7 +156,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        cred = rpc_lookup_machine_cred();
        if (!IS_ERR(cred))
                clp->cl_machine_cred = cred;
-
+#if defined(CONFIG_NFS_V4_1)
+       INIT_LIST_HEAD(&clp->cl_layouts);
+#endif
        nfs_fscache_get_client_cookie(clp);
 
        return clp;
@@ -252,6 +255,7 @@ void nfs_put_client(struct nfs_client *clp)
                nfs_free_client(clp);
        }
 }
+EXPORT_SYMBOL_GPL(nfs_put_client);
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 /*
@@ -900,6 +904,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
        if (server->wsize > NFS_MAX_FILE_IO_SIZE)
                server->wsize = NFS_MAX_FILE_IO_SIZE;
        server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       set_pnfs_layoutdriver(server, fsinfo->layouttype);
+
        server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
        server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
@@ -939,6 +945,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
        }
 
        fsinfo.fattr = fattr;
+       fsinfo.layouttype = 0;
        error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
        if (error < 0)
                goto out_error;
@@ -1021,6 +1028,7 @@ void nfs_free_server(struct nfs_server *server)
 {
        dprintk("--> nfs_free_server()\n");
 
+       unset_pnfs_layoutdriver(server);
        spin_lock(&nfs_client_lock);
        list_del(&server->client_link);
        list_del(&server->master_link);
index e18c31e08a2897fb5c2a7990a0e34147845e37ad..e756075637b005ba1937b275c2d77ed3ff8ad676 100644 (file)
@@ -36,6 +36,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY                NFSDBG_FILE
 
@@ -386,6 +387,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
                file->f_path.dentry->d_name.name,
                mapping->host->i_ino, len, (long long) pos);
 
+       pnfs_update_layout(mapping->host,
+                          nfs_file_open_context(file),
+                          IOMODE_RW);
+
 start:
        /*
         * Prevent starvation issues if someone is doing a consistency
index 6eec2865641526ee5be081e6ceb317a93ec987a4..314f57164602eda0762c0a226db44aa19be461f5 100644 (file)
@@ -48,6 +48,7 @@
 #include "internal.h"
 #include "fscache.h"
 #include "dns_resolve.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY                NFSDBG_VFS
 
@@ -1410,6 +1411,7 @@ void nfs4_evict_inode(struct inode *inode)
 {
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
+       pnfs_destroy_layout(NFS_I(inode));
        /* If we are holding a delegation, return it! */
        nfs_inode_return_delegation_noreclaim(inode);
        /* First call standard NFS clear_inode() code */
@@ -1447,6 +1449,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
        nfsi->delegation = NULL;
        nfsi->delegation_state = 0;
        init_rwsem(&nfsi->rwsem);
+       nfsi->layout = NULL;
 #endif
 }
 
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
new file mode 100644 (file)
index 0000000..2e92f0d
--- /dev/null
@@ -0,0 +1,280 @@
+/*
+ *  Module for the pnfs nfs4 file layout driver.
+ *  Defines all I/O and Policy interface operations, plus code
+ *  to register itself with the pNFS client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+#include "nfs4filelayout.h"
+
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
+MODULE_DESCRIPTION("The NFSv4 file layout driver");
+
+/*
+ * Layout-driver "set" hook: ask the generic pNFS code to set up the
+ * deviceid cache on nfss->nfs_client, handing it
+ * nfs4_fl_free_deviceid_callback as the callback.
+ *
+ * Returns 0 on success, otherwise the status reported by
+ * pnfs_alloc_init_deviceid_cache().
+ */
+static int
+filelayout_set_layoutdriver(struct nfs_server *nfss)
+{
+       int err;
+
+       err = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
+                                            nfs4_fl_free_deviceid_callback);
+       if (err == 0) {
+               dprintk("%s: deviceid cache has been initialized successfully\n",
+                       __func__);
+               return 0;
+       }
+
+       printk(KERN_WARNING "%s: deviceid cache could not be "
+               "initialized\n", __func__);
+       return err;
+}
+
+/*
+ * Layout-driver "clear" hook: if the client has a deviceid cache,
+ * call pnfs_put_deviceid_cache() on it.  Always returns 0.
+ */
+static int
+filelayout_clear_layoutdriver(struct nfs_server *nfss)
+{
+       struct nfs_client *clp = nfss->nfs_client;
+
+       dprintk("--> %s\n", __func__);
+
+       if (clp->cl_devid_cache != NULL)
+               pnfs_put_deviceid_cache(clp);
+
+       return 0;
+}
+
+/*
+ * filelayout_check_layout()
+ *
+ * Make sure layout segment parameters are sane WRT the device.
+ * At this point no generic layer initialization of the lseg has occurred,
+ * and nothing has been added to the layout_hdr cache.
+ *
+ * @lo:  layout header the new segment will belong to
+ * @fl:  freshly decoded segment; fl->dsaddr is filled in here
+ * @lgr: LAYOUTGET result the segment was decoded from
+ * @id:  device id named by the layout
+ *
+ * Returns 0 on success, with fl->dsaddr pointing at a referenced device.
+ * Returns -EINVAL on any failure; a device reference taken here is
+ * dropped again and fl->dsaddr is reset to NULL in that case.
+ */
+static int
+filelayout_check_layout(struct pnfs_layout_hdr *lo,
+                       struct nfs4_filelayout_segment *fl,
+                       struct nfs4_layoutget_res *lgr,
+                       struct nfs4_deviceid *id)
+{
+       struct nfs4_file_layout_dsaddr *dsaddr;
+       int status = -EINVAL;
+       struct nfs_server *nfss = NFS_SERVER(lo->inode);
+
+       dprintk("--> %s\n", __func__);
+
+       if (fl->pattern_offset > lgr->range.offset) {
+               dprintk("%s pattern_offset %lld too large\n",
+                               __func__, fl->pattern_offset);
+               goto out;
+       }
+
+       /* Zero is a multiple of PAGE_SIZE, so it must be rejected explicitly */
+       if (!fl->stripe_unit) {
+               dprintk("%s Invalid stripe unit (%u)\n",
+                       __func__, fl->stripe_unit);
+               goto out;
+       }
+
+       if (fl->stripe_unit % PAGE_SIZE) {
+               dprintk("%s Stripe unit (%u) not page aligned\n",
+                       __func__, fl->stripe_unit);
+               goto out;
+       }
+
+       /* find and reference the deviceid */
+       dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
+       if (dsaddr == NULL) {
+               dsaddr = get_device_info(lo->inode, id);
+               if (dsaddr == NULL)
+                       goto out;
+       }
+       fl->dsaddr = dsaddr;
+
+       /* first_stripe_index is unsigned, so only the upper bound matters */
+       if (fl->first_stripe_index >= dsaddr->stripe_count) {
+               dprintk("%s Bad first_stripe_index %u\n",
+                               __func__, fl->first_stripe_index);
+               goto out_put;
+       }
+
+       if ((fl->stripe_type == STRIPE_SPARSE &&
+           fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) ||
+           (fl->stripe_type == STRIPE_DENSE &&
+           fl->num_fh != dsaddr->stripe_count)) {
+               dprintk("%s num_fh %u not valid for given packing\n",
+                       __func__, fl->num_fh);
+               goto out_put;
+       }
+
+       /* Informational only: a misaligned stripe unit is not rejected */
+       if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
+               dprintk("%s Stripe unit (%u) not aligned with rsize %u "
+                       "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
+                       nfss->wsize);
+       }
+
+       status = 0;
+out:
+       dprintk("--> %s returns %d\n", __func__, status);
+       return status;
+out_put:
+       pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
+       /* Do not leave a pointer to a device we no longer hold */
+       fl->dsaddr = NULL;
+       goto out;
+}
+
+/*
+ * Free the file handles held in fl->fh_array, then the array itself,
+ * and reset fl->fh_array to NULL.
+ *
+ * Safe to call when fl->fh_array is already NULL, whatever fl->num_fh
+ * says: that state occurs when decoding failed before or after the
+ * array was allocated and the caller then frees the segment.
+ */
+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+{
+       unsigned int i;
+
+       if (!fl->fh_array)
+               return;
+
+       for (i = 0; i < fl->num_fh; i++) {
+               /* Entries are filled in order; the first NULL ends the list */
+               if (!fl->fh_array[i])
+                       break;
+               kfree(fl->fh_array[i]);
+       }
+       kfree(fl->fh_array);
+       fl->fh_array = NULL;
+}
+
+/* Release a file-layout segment: its file handle array first, then @fl. */
+static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
+{
+       filelayout_free_fh_array(fl);
+
+       kfree(fl);
+}
+
+/*
+ * Decode the opaque layout body of a LAYOUTGET reply (lgr->layout.buf)
+ * into @fl and copy the device id it names into @id.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, or -EIO if the
+ * server sent a file handle longer than NFS_MAXFHSIZE.  On failure the
+ * fh array is released and fl->num_fh is reset to 0, so the caller can
+ * free @fl without walking a missing array.
+ *
+ * NOTE(review): nothing here checks the decoded lengths against the size
+ * of lgr->layout.buf; confirm that the caller bounds the buffer.
+ */
+static int
+filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+                        struct nfs4_filelayout_segment *fl,
+                        struct nfs4_layoutget_res *lgr,
+                        struct nfs4_deviceid *id)
+{
+       uint32_t *p = (uint32_t *)lgr->layout.buf;
+       uint32_t nfl_util;
+       uint32_t fh_size;
+       unsigned int i;
+       int status;
+
+       dprintk("%s: set_layout_map Begin\n", __func__);
+
+       memcpy(id, p, sizeof(*id));
+       p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+       print_deviceid(id);
+
+       /* nfl_util packs the flag bits together with the stripe unit */
+       nfl_util = be32_to_cpup(p++);
+       if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
+               fl->commit_through_mds = 1;
+       if (nfl_util & NFL4_UFLG_DENSE)
+               fl->stripe_type = STRIPE_DENSE;
+       else
+               fl->stripe_type = STRIPE_SPARSE;
+       fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK;
+
+       fl->first_stripe_index = be32_to_cpup(p++);
+       p = xdr_decode_hyper(p, &fl->pattern_offset);
+       fl->num_fh = be32_to_cpup(p++);
+
+       dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n",
+               __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
+               fl->pattern_offset);
+
+       /* kcalloc() rejects a count whose byte size would overflow */
+       fl->fh_array = kcalloc(fl->num_fh, sizeof(struct nfs_fh *),
+                              GFP_KERNEL);
+       if (!fl->fh_array) {
+               status = -ENOMEM;
+               goto out_reset;
+       }
+
+       for (i = 0; i < fl->num_fh; i++) {
+               /* Do we want to use a mempool here? */
+               fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
+               if (!fl->fh_array[i]) {
+                       status = -ENOMEM;
+                       goto out_free;
+               }
+               /*
+                * Validate the full 32-bit wire value before it is
+                * narrowed into ->size, and bound it by the capacity of
+                * ->data rather than by the whole structure.
+                */
+               fh_size = be32_to_cpup(p++);
+               if (fh_size > NFS_MAXFHSIZE) {
+                       printk(KERN_ERR "Too big fh %u received %u\n",
+                              i, fh_size);
+                       status = -EIO;
+                       goto out_free;
+               }
+               fl->fh_array[i]->size = fh_size;
+               memcpy(fl->fh_array[i]->data, p, fh_size);
+               p += XDR_QUADLEN(fh_size);
+               dprintk("DEBUG: %s: fh len %d\n", __func__,
+                       fl->fh_array[i]->size);
+       }
+
+       return 0;
+
+out_free:
+       filelayout_free_fh_array(fl);
+out_reset:
+       /* With no array left, make sure nobody iterates over it */
+       fl->num_fh = 0;
+       return status;
+}
+
+/*
+ * Layout-driver "alloc_lseg" hook: allocate a zeroed file-layout segment,
+ * decode the LAYOUTGET result into it and sanity-check it.
+ *
+ * Returns the embedded generic segment header, or NULL if the allocation,
+ * the decode or the check fails.
+ */
+static struct pnfs_layout_segment *
+filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
+                     struct nfs4_layoutget_res *lgr)
+{
+       struct nfs4_deviceid devid;
+       struct nfs4_filelayout_segment *seg;
+
+       dprintk("--> %s\n", __func__);
+
+       seg = kzalloc(sizeof(*seg), GFP_KERNEL);
+       if (seg == NULL)
+               return NULL;
+
+       if (filelayout_decode_layout(layoutid, seg, lgr, &devid) != 0)
+               goto out_free;
+       if (filelayout_check_layout(layoutid, seg, lgr, &devid) != 0)
+               goto out_free;
+
+       return &seg->generic_hdr;
+
+out_free:
+       _filelayout_free_lseg(seg);
+       return NULL;
+}
+
+/*
+ * Layout-driver "free_lseg" hook: put the deviceid the segment points at,
+ * then free the segment and its file handles.
+ */
+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+       struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+       struct nfs_client *clp = NFS_SERVER(lseg->layout->inode)->nfs_client;
+
+       dprintk("--> %s\n", __func__);
+
+       pnfs_put_deviceid(clp->cl_devid_cache, &fl->dsaddr->deviceid);
+       _filelayout_free_lseg(fl);
+}
+
+/* Driver descriptor passed to pnfs_register_layoutdriver() at module init. */
+static struct pnfs_layoutdriver_type filelayout_type = {
+       .id                     = LAYOUT_NFSV4_1_FILES,
+       .name                   = "LAYOUT_NFSV4_1_FILES",
+       .owner                  = THIS_MODULE,
+       .set_layoutdriver       = filelayout_set_layoutdriver,
+       .clear_layoutdriver     = filelayout_clear_layoutdriver,
+       .alloc_lseg             = filelayout_alloc_lseg,
+       .free_lseg              = filelayout_free_lseg,
+};
+
+/* Module init: log a notice and register filelayout_type. */
+static int __init nfs4filelayout_init(void)
+{
+       int ret;
+
+       printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
+              __func__);
+
+       ret = pnfs_register_layoutdriver(&filelayout_type);
+       return ret;
+}
+
+/* Module exit: log a notice and unregister filelayout_type. */
+static void __exit nfs4filelayout_exit(void)
+{
+       printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
+              __func__);
+
+       pnfs_unregister_layoutdriver(&filelayout_type);
+}
+
+module_init(nfs4filelayout_init);
+module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
new file mode 100644 (file)
index 0000000..bbf60dd
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ *  NFSv4 file layout driver data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+
+#include "pnfs.h"
+
+/*
+ * Field testing shows we need to support up to 4096 stripe indices.
+ * We store each index as a u8 (u32 on the wire) to keep the memory footprint
+ * reasonable. This in turn means we support a maximum of 256
+ * RFC 5661 multipath_list4 structures.
+ */
+#define NFS4_PNFS_MAX_STRIPE_CNT 4096
+#define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */
+
+/*
+ * Stripe packing of a file layout segment.  The layout decoder picks
+ * STRIPE_DENSE when NFL4_UFLG_DENSE is set in nfl_util and
+ * STRIPE_SPARSE otherwise.
+ */
+enum stripetype4 {
+       STRIPE_SPARSE = 1,
+       STRIPE_DENSE = 2
+};
+
+/*
+ * Individual ip address
+ *
+ * One entry of the data server cache, keyed by (ds_ip_addr, ds_port).
+ */
+struct nfs4_pnfs_ds {
+       struct list_head        ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
+       u32                     ds_ip_addr;     /* debug output applies ntohl() */
+       u32                     ds_port;        /* stored as the result of htons() */
+       struct nfs_client       *ds_clp;        /* put on destroy; may be NULL */
+       atomic_t                ds_count;       /* reference count, 1 on creation */
+};
+
+/*
+ * A decoded device: the stripe index table plus the data servers it
+ * refers to.  Embedded in the deviceid cache through 'deviceid'.
+ */
+struct nfs4_file_layout_dsaddr {
+       struct pnfs_deviceid_node       deviceid;
+       u32                             stripe_count;   /* entries in stripe_indices */
+       u8                              *stripe_indices; /* separately allocated */
+       u32                             ds_num;         /* entries in ds_list */
+       /* Allocated with room for ds_num pointers, not just the one declared */
+       struct nfs4_pnfs_ds             *ds_list[1];
+};
+
+/*
+ * File layout driver's view of one layout segment.  The generic segment
+ * is embedded as 'generic_hdr'; FILELAYOUT_LSEG() maps back from it.
+ */
+struct nfs4_filelayout_segment {
+       struct pnfs_layout_segment generic_hdr;
+       u32 stripe_type;                /* enum stripetype4 value */
+       u32 commit_through_mds;         /* 1 if NFL4_UFLG_COMMIT_THRU_MDS was set */
+       u32 stripe_unit;                /* nfl_util with the flag bits masked off */
+       u32 first_stripe_index;
+       u64 pattern_offset;
+       struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
+       unsigned int num_fh;            /* number of entries in fh_array */
+       struct nfs_fh **fh_array;       /* each entry separately allocated */
+};
+
+/* Map an embedded generic_hdr back to its enclosing file-layout segment. */
+static inline struct nfs4_filelayout_segment *
+FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+       struct nfs4_filelayout_segment *fl;
+
+       fl = container_of(lseg, struct nfs4_filelayout_segment, generic_hdr);
+       return fl;
+}
+
+extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
+extern void print_ds(struct nfs4_pnfs_ds *ds);
+extern void print_deviceid(struct nfs4_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
+
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
new file mode 100644 (file)
index 0000000..51fe64a
--- /dev/null
@@ -0,0 +1,448 @@
+/*
+ *  Device operations for the pnfs nfs4 file layout driver.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+
+#include "internal.h"
+#include "nfs4filelayout.h"
+
+#define NFSDBG_FACILITY                NFSDBG_PNFS_LD
+
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ *   - set to 1 on allocation
+ *   - incremented when a device id maps a data server already in the cache.
+ *   - decremented when deviceid is removed from the cache.
+ */
+DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+
+/* Debug routines */
+/*
+ * Print one data server entry: address, port, reference count, client
+ * pointer and the client's exchange flags (0 when there is no client).
+ * A NULL @ds is reported instead of dereferenced.
+ */
+void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+       if (ds != NULL) {
+               printk("        ip_addr %x port %hu\n"
+                       "        ref count %d\n"
+                       "        client %p\n"
+                       "        cl_exchange_flags %x\n",
+                       ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+                       atomic_read(&ds->ds_count), ds->ds_clp,
+                       ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+               return;
+       }
+
+       printk("%s NULL device\n", __func__);
+}
+
+/* When FACILITY debugging is enabled, print every data server of @dsaddr. */
+void
+print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+       int idx;
+
+       ifdebug(FACILITY) {
+               printk("%s dsaddr->ds_num %d\n", __func__,
+                      dsaddr->ds_num);
+               idx = 0;
+               while (idx < dsaddr->ds_num) {
+                       print_ds(dsaddr->ds_list[idx]);
+                       idx++;
+               }
+       }
+}
+
+/* Debug helper: print the device id as four 32-bit words in hex. */
+void print_deviceid(struct nfs4_deviceid *id)
+{
+       const u32 *word = (const u32 *)id;
+
+       dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+               word[0], word[1], word[2], word[3]);
+}
+
+/*
+ * Find the cached data server matching @ip_addr and @port.
+ * nfs4_ds_cache_lock is held by the caller.
+ * Returns the entry, or NULL when nothing matches.
+ */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(u32 ip_addr, u32 port)
+{
+       struct nfs4_pnfs_ds *entry;
+
+       dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
+                       ntohl(ip_addr), ntohs(port));
+
+       list_for_each_entry(entry, &nfs4_data_server_cache, ds_node) {
+               if (entry->ds_ip_addr != ip_addr)
+                       continue;
+               if (entry->ds_port != port)
+                       continue;
+               return entry;
+       }
+
+       return NULL;
+}
+
+/* Free a data server entry, putting its nfs_client first if it has one. */
+static void
+destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+       dprintk("--> %s\n", __func__);
+       ifdebug(FACILITY)
+               print_ds(ds);
+
+       if (ds->ds_clp != NULL)
+               nfs_put_client(ds->ds_clp);
+
+       kfree(ds);
+}
+
+/*
+ * Tear down a device: drop one reference on each data server it lists
+ * (unlinking and destroying those whose count reaches zero), then free
+ * the stripe index table and the device itself.
+ */
+static void
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
+{
+       struct nfs4_pnfs_ds *ds;
+       int i;
+
+       print_deviceid(&dsaddr->deviceid.de_id);
+
+       for (i = 0; i < dsaddr->ds_num; i++) {
+               ds = dsaddr->ds_list[i];
+               if (ds == NULL)
+                       continue;
+               /* Takes nfs4_ds_cache_lock only when the count hits zero */
+               if (!atomic_dec_and_lock(&ds->ds_count, &nfs4_ds_cache_lock))
+                       continue;
+               list_del_init(&ds->ds_node);
+               spin_unlock(&nfs4_ds_cache_lock);
+               destroy_ds(ds);
+       }
+
+       kfree(dsaddr->stripe_indices);
+       kfree(dsaddr);
+}
+
+/*
+ * Callback taking the generic deviceid node: recover the enclosing
+ * nfs4_file_layout_dsaddr and free it.
+ */
+void
+nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
+{
+       nfs4_fl_free_deviceid(container_of(device,
+                                          struct nfs4_file_layout_dsaddr,
+                                          deviceid));
+}
+
+/*
+ * Return the data server cache entry for (@ip_addr, @port).
+ *
+ * A candidate entry is allocated before taking nfs4_ds_cache_lock.  If
+ * the cache already has a match, the candidate is freed and the match is
+ * returned with ds_count incremented; otherwise the candidate is
+ * initialised with ds_count 1 and added to the cache.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
+{
+       struct nfs4_pnfs_ds *cached, *ds;
+
+       ds = kzalloc(sizeof(*cached), GFP_KERNEL);
+       if (ds == NULL)
+               return NULL;
+
+       spin_lock(&nfs4_ds_cache_lock);
+       cached = _data_server_lookup_locked(ip_addr, port);
+       if (cached != NULL) {
+               kfree(ds);
+               atomic_inc(&cached->ds_count);
+               dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n",
+                       __func__, cached->ds_ip_addr,
+                       atomic_read(&cached->ds_count));
+               ds = cached;
+       } else {
+               ds->ds_ip_addr = ip_addr;
+               ds->ds_port = port;
+               atomic_set(&ds->ds_count, 1);
+               INIT_LIST_HEAD(&ds->ds_node);
+               ds->ds_clp = NULL;
+               list_add(&ds->ds_node, &nfs4_data_server_cache);
+               dprintk("%s add new data server ip 0x%x\n", __func__,
+                       ds->ds_ip_addr);
+       }
+       spin_unlock(&nfs4_ds_cache_lock);
+
+       return ds;
+}
+
+/*
+ * Decode one netaddr4 (r_netid, r_addr) at *pp and look up or create the
+ * matching data server cache entry.
+ *
+ * Currently only support ipv4, and one multi-path address.
+ *
+ * @pp:    in/out cursor into the XDR buffer; always advanced past the
+ *         two opaque fields, even when the address is then rejected
+ * @inode: passed through to nfs4_pnfs_ds_add()
+ *
+ * r_addr is expected as "a.b.c.d.p1.p2", where the last two fields are
+ * the high and low bytes of the port.
+ *
+ * Returns the data server entry, or NULL if the netid is not "tcp", the
+ * address is malformed, or memory could not be allocated.
+ */
+static struct nfs4_pnfs_ds *
+decode_and_add_ds(__be32 **pp, struct inode *inode)
+{
+       struct nfs4_pnfs_ds *ds = NULL;
+       char *buf;
+       const char *ipend, *pstr;
+       u32 ip_addr, port;
+       int nlen, rlen, i;
+       int tmp[2];
+       __be32 *r_netid, *r_addr, *p = *pp;
+
+       /* r_netid */
+       nlen = be32_to_cpup(p++);
+       r_netid = p;
+       p += XDR_QUADLEN(nlen);
+
+       /* r_addr */
+       rlen = be32_to_cpup(p++);
+       r_addr = p;
+       p += XDR_QUADLEN(rlen);
+       *pp = p;
+
+       /* Check that netid is "tcp" */
+       if (nlen != 3 ||  memcmp((char *)r_netid, "tcp", 3)) {
+               dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
+               goto out_err;
+       }
+
+       /* ipv6 length plus port is legal; a negative length never is */
+       if (rlen < 0 || rlen > INET6_ADDRSTRLEN + 8) {
+               dprintk("%s Invalid address, length %d\n", __func__,
+                       rlen);
+               goto out_err;
+       }
+       buf = kmalloc(rlen + 1, GFP_KERNEL);
+       if (!buf) {
+               dprintk("%s: Not enough memory\n", __func__);
+               goto out_err;
+       }
+       memcpy(buf, r_addr, rlen);
+       buf[rlen] = '\0';
+
+       /* replace the port dots with dashes for the in4_pton() delimiter */
+       for (i = 0; i < 2; i++) {
+               char *res = strrchr(buf, '.');
+
+               /* The server controls this string; it may have too few dots */
+               if (!res) {
+                       dprintk("%s: Failed finding expected dots in port\n",
+                               __func__);
+                       goto out_free;
+               }
+               *res = '-';
+       }
+
+       /* Currently only support ipv4 address */
+       if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) {
+               dprintk("%s: Only ipv4 addresses supported\n", __func__);
+               goto out_free;
+       }
+
+       /* port: both bytes must be present and fit in 0..255 */
+       pstr = ipend;
+       if (sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]) != 2 ||
+           tmp[0] < 0 || tmp[0] > 255 || tmp[1] < 0 || tmp[1] > 255) {
+               dprintk("%s: Invalid port in address %s\n", __func__, buf);
+               goto out_free;
+       }
+       port = htons((tmp[0] << 8) | (tmp[1]));
+
+       ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
+       dprintk("%s Decoded address and port %s\n", __func__, buf);
+out_free:
+       kfree(buf);
+out_err:
+       return ds;
+}
+
+/*
+ * Decode the opaque device address held in pdev->area into a newly
+ * allocated nfs4_file_layout_dsaddr.
+ *
+ * The buffer is read as: stripe count, that many stripe indices, the
+ * multipath list count, then for each list entry a count followed by that
+ * many (r_netid, r_addr) pairs.  Only the first address of each entry is
+ * used; any further ones are skipped.
+ *
+ * Returns the new dsaddr, or NULL if a count exceeds the supported
+ * maximum, a stripe index is out of range, an allocation fails, or a data
+ * server cannot be decoded.
+ */
+static struct nfs4_file_layout_dsaddr*
+decode_device(struct inode *ino, struct pnfs_device *pdev)
+{
+       int i, dummy;
+       u32 cnt, num;
+       u8 *indexp;
+       __be32 *p = (__be32 *)pdev->area, *indicesp;
+       struct nfs4_file_layout_dsaddr *dsaddr;
+
+       /* Get the stripe count (number of stripe index) */
+       cnt = be32_to_cpup(p++);
+       dprintk("%s stripe count  %d\n", __func__, cnt);
+       if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
+               printk(KERN_WARNING "%s: stripe count %d greater than "
+                      "supported maximum %d\n", __func__,
+                       cnt, NFS4_PNFS_MAX_STRIPE_CNT);
+               goto out_err;
+       }
+
+       /* Check the multipath list count */
+       indicesp = p;
+       p += XDR_QUADLEN(cnt << 2);
+       num = be32_to_cpup(p++);
+       dprintk("%s ds_num %u\n", __func__, num);
+       if (num > NFS4_PNFS_MAX_MULTI_CNT) {
+               printk(KERN_WARNING "%s: multipath count %d greater than "
+                       "supported maximum %d\n", __func__,
+                       num, NFS4_PNFS_MAX_MULTI_CNT);
+               goto out_err;
+       }
+       /* NOTE(review): num == 0 makes (num - 1) wrap - confirm intent */
+       dsaddr = kzalloc(sizeof(*dsaddr) +
+                       (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
+                       GFP_KERNEL);
+       if (!dsaddr)
+               goto out_err;
+
+       dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
+       if (!dsaddr->stripe_indices)
+               goto out_err_free;
+
+       dsaddr->stripe_count = cnt;
+       dsaddr->ds_num = num;
+
+       memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+
+       /* Go back and read stripe indices */
+       p = indicesp;
+       indexp = &dsaddr->stripe_indices[0];
+       for (i = 0; i < dsaddr->stripe_count; i++) {
+               u32 idx = be32_to_cpup(p++);
+
+               /*
+                * Range-check the full 32-bit value before narrowing it:
+                * stripe_indices[] holds u8, so e.g. 256 would otherwise
+                * wrap to 0 and pass the check.
+                */
+               if (idx >= num || idx > 255)
+                       goto out_err_free;
+               *indexp++ = idx;
+       }
+       /* Skip already read multipath list count */
+       p++;
+
+       for (i = 0; i < dsaddr->ds_num; i++) {
+               int j;
+
+               dummy = be32_to_cpup(p++); /* multipath count */
+               if (dummy > 1) {
+                       printk(KERN_WARNING
+                              "%s: Multipath count %d not supported, "
+                              "skipping all greater than 1\n", __func__,
+                               dummy);
+               }
+               for (j = 0; j < dummy; j++) {
+                       if (j == 0) {
+                               dsaddr->ds_list[i] = decode_and_add_ds(&p, ino);
+                               if (dsaddr->ds_list[i] == NULL)
+                                       goto out_err_free;
+                       } else {
+                               u32 len;
+                               /* skip extra multipath */
+                               len = be32_to_cpup(p++);
+                               p += XDR_QUADLEN(len);
+                               len = be32_to_cpup(p++);
+                               p += XDR_QUADLEN(len);
+                       }
+               }
+       }
+       return dsaddr;
+
+out_err_free:
+       nfs4_fl_free_deviceid(dsaddr);
+out_err:
+       dprintk("%s ERROR: returning NULL\n", __func__);
+       return NULL;
+}
+
+/*
+ * Decode the opaque device described by 'dev' and hand its deviceid node
+ * to pnfs_add_deviceid() for this client's device id cache.
+ *
+ * The structure returned is the one that embeds the node handed back by
+ * pnfs_add_deviceid(); presumably that is the already cached entry when
+ * the device id was known - confirm against pnfs_add_deviceid().
+ * Returns NULL when decoding fails.
+ */
+static struct nfs4_file_layout_dsaddr*
+decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
+{
+       struct nfs4_file_layout_dsaddr *decoded = decode_device(inode, dev);
+       struct pnfs_deviceid_node *node;
+
+       if (decoded == NULL) {
+               printk(KERN_WARNING "%s: Could not decode or add device\n",
+                       __func__);
+               return NULL;
+       }
+
+       node = pnfs_add_deviceid(
+                       NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+                       &decoded->deviceid);
+
+       return container_of(node, struct nfs4_file_layout_dsaddr, deviceid);
+}
+
+/*
+ * Retrieve the information for dev_id, add it to the list
+ * of available devices, and return it.
+ *
+ * A page array sized from the session's fore channel max_resp_sz is
+ * allocated and vmap()ed to receive the GETDEVICEINFO reply, which is then
+ * passed to decode_and_add_device().  The pages, the mapping and the
+ * pnfs_device are all released before returning.
+ *
+ * Returns the decoded device, or NULL on allocation, RPC or decode failure.
+ */
+struct nfs4_file_layout_dsaddr *
+get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
+{
+       struct pnfs_device *pdev = NULL;
+       u32 max_resp_sz;
+       int max_pages;
+       struct page **pages = NULL;
+       struct nfs4_file_layout_dsaddr *dsaddr = NULL;
+       int rc, i;
+       struct nfs_server *server = NFS_SERVER(inode);
+
+       /*
+        * Use the session max response size as the basis for setting
+        * GETDEVICEINFO's maxcount
+        */
+       max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+       max_pages = max_resp_sz >> PAGE_SHIFT;
+       dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
+               __func__, inode, max_resp_sz, max_pages);
+
+       pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL);
+       if (pdev == NULL)
+               return NULL;
+
+       pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
+       if (pages == NULL) {
+               kfree(pdev);
+               return NULL;
+       }
+       for (i = 0; i < max_pages; i++) {
+               pages[i] = alloc_page(GFP_KERNEL);
+               if (!pages[i])
+                       goto out_free;
+       }
+
+       /* set pdev->area */
+       pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
+       if (!pdev->area)
+               goto out_free;
+
+       memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+       pdev->layout_type = LAYOUT_NFSV4_1_FILES;
+       pdev->pages = pages;
+       pdev->pgbase = 0;
+       pdev->pglen = PAGE_SIZE * max_pages;
+       pdev->mincount = 0;
+
+       rc = nfs4_proc_getdeviceinfo(server, pdev);
+       dprintk("%s getdevice info returns %d\n", __func__, rc);
+       if (rc)
+               goto out_free;
+
+       /*
+        * Found new device, need to decode it and then add it to the
+        * list of known devices for this mountpoint.
+        */
+       dsaddr = decode_and_add_device(inode, pdev);
+out_free:
+       if (pdev->area != NULL)
+               vunmap(pdev->area);
+       /*
+        * The array was zeroed and is filled in order, so after a failed
+        * alloc_page() the remaining slots are NULL and must be skipped.
+        */
+       for (i = 0; i < max_pages; i++) {
+               if (pages[i] != NULL)
+                       __free_page(pages[i]);
+       }
+       kfree(pages);
+       kfree(pdev);
+       dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
+       return dsaddr;
+}
+
+/*
+ * Look 'id' up in the client's device id cache via
+ * pnfs_find_get_deviceid() and return the file layout structure that
+ * embeds the node found, or NULL when the lookup returns nothing.
+ */
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
+{
+       struct pnfs_deviceid_node *node;
+
+       node = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
+       if (node == NULL)
+               return NULL;
+       return container_of(node, struct nfs4_file_layout_dsaddr, deviceid);
+}
index e87fe612ca184426d13118377c5f2f1f440e6af4..32c8758c99fd78b6a44ea61c37f9f9fa7e57f6e3 100644 (file)
@@ -55,6 +55,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "callback.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY                NFSDBG_PROC
 
@@ -130,6 +131,7 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
                        | FATTR4_WORD0_MAXWRITE
                        | FATTR4_WORD0_LEASE_TIME,
                        FATTR4_WORD1_TIME_DELTA
+                       | FATTR4_WORD1_FS_LAYOUT_TYPES
 };
 
 const u32 nfs4_fs_locations_bitmap[2] = {
@@ -4840,49 +4842,56 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
                args->bc_attrs.max_reqs);
 }
 
-static int _verify_channel_attr(char *chan, char *attr_name, u32 sent, u32 rcvd)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
 {
-       if (rcvd <= sent)
-               return 0;
-       printk(KERN_WARNING "%s: Session INVALID: %s channel %s increased. "
-               "sent=%u rcvd=%u\n", __func__, chan, attr_name, sent, rcvd);
-       return -EINVAL;
+       struct nfs4_channel_attrs *sent = &args->fc_attrs;
+       struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
+
+       if (rcvd->headerpadsz > sent->headerpadsz)
+               return -EINVAL;
+       if (rcvd->max_resp_sz > sent->max_resp_sz)
+               return -EINVAL;
+       /*
+        * Our requested max_ops is the minimum we need; we're not
+        * prepared to break up compounds into smaller pieces than that.
+        * So, no point even trying to continue if the server won't
+        * cooperate:
+        */
+       if (rcvd->max_ops < sent->max_ops)
+               return -EINVAL;
+       if (rcvd->max_reqs == 0)
+               return -EINVAL;
+       return 0;
 }
 
-#define _verify_fore_channel_attr(_name_) \
-       _verify_channel_attr("fore", #_name_, \
-                            args->fc_attrs._name_, \
-                            session->fc_attrs._name_)
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+{
+       struct nfs4_channel_attrs *sent = &args->bc_attrs;
+       struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
 
-#define _verify_back_channel_attr(_name_) \
-       _verify_channel_attr("back", #_name_, \
-                            args->bc_attrs._name_, \
-                            session->bc_attrs._name_)
+       if (rcvd->max_rqst_sz > sent->max_rqst_sz)
+               return -EINVAL;
+       if (rcvd->max_resp_sz < sent->max_resp_sz)
+               return -EINVAL;
+       if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
+               return -EINVAL;
+       /* These would render the backchannel useless: */
+       if (rcvd->max_ops  == 0)
+               return -EINVAL;
+       if (rcvd->max_reqs == 0)
+               return -EINVAL;
+       return 0;
+}
 
-/*
- * The server is not allowed to increase the fore channel header pad size,
- * maximum response size, or maximum number of operations.
- *
- * The back channel attributes are only negotiatied down: We send what the
- * (back channel) server insists upon.
- */
 static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
                                     struct nfs4_session *session)
 {
-       int ret = 0;
-
-       ret |= _verify_fore_channel_attr(headerpadsz);
-       ret |= _verify_fore_channel_attr(max_resp_sz);
-       ret |= _verify_fore_channel_attr(max_ops);
-
-       ret |= _verify_back_channel_attr(headerpadsz);
-       ret |= _verify_back_channel_attr(max_rqst_sz);
-       ret |= _verify_back_channel_attr(max_resp_sz);
-       ret |= _verify_back_channel_attr(max_resp_sz_cached);
-       ret |= _verify_back_channel_attr(max_ops);
-       ret |= _verify_back_channel_attr(max_reqs);
+       int ret;
 
-       return ret;
+       ret = nfs4_verify_fore_channel_attrs(args, session);
+       if (ret)
+               return ret;
+       return nfs4_verify_back_channel_attrs(args, session);
 }
 
 static int _nfs4_proc_create_session(struct nfs_client *clp)
@@ -5255,6 +5264,147 @@ out:
        dprintk("<-- %s status=%d\n", __func__, status);
        return status;
 }
+
+/*
+ * rpc_call_prepare callback for LAYOUTGET: set up the session sequence
+ * for the call and start it only when nfs4_setup_sequence() returns 0.
+ */
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+       struct nfs4_layoutget *lgp = calldata;
+       struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+       int ret;
+
+       dprintk("--> %s\n", __func__);
+       ret = nfs4_setup_sequence(server, &lgp->args.seq_args,
+                                 &lgp->res.seq_res, 0, task);
+       if (ret == 0)
+               rpc_call_start(task);
+}
+
+/*
+ * rpc_call_done callback for LAYOUTGET.
+ *
+ * LAYOUTTRYLATER and RECALLCONFLICT are rewritten to NFS4ERR_DELAY before
+ * the generic async error handler sees them.  When that handler asks for
+ * a retry (-EAGAIN) the call is restarted; otherwise the final task status
+ * is recorded in lgp->status.
+ */
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+       struct nfs4_layoutget *lgp = calldata;
+       struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+
+       dprintk("--> %s\n", __func__);
+
+       if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+               return;
+
+       if (task->tk_status != 0) {
+               if (task->tk_status == -NFS4ERR_LAYOUTTRYLATER ||
+                   task->tk_status == -NFS4ERR_RECALLCONFLICT)
+                       task->tk_status = -NFS4ERR_DELAY;
+
+               if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+                       rpc_restart_call_prepare(task);
+                       return;
+               }
+       }
+       lgp->status = task->tk_status;
+       dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * rpc_release callback for LAYOUTGET: drop the layout header reference,
+ * free the reply page if one was allocated, drop the open context and
+ * free the nfs4_layoutget itself.
+ */
+static void nfs4_layoutget_release(void *calldata)
+{
+       struct nfs4_layoutget *lgp = calldata;
+       void *reply_page;
+
+       dprintk("--> %s\n", __func__);
+       put_layout_hdr(lgp->args.inode);
+       reply_page = lgp->res.layout.buf;
+       if (reply_page)
+               free_page((unsigned long) reply_page);
+       put_nfs_open_context(lgp->args.ctx);
+       kfree(calldata);
+       dprintk("<-- %s\n", __func__);
+}
+
+/* RPC callbacks for the asynchronous LAYOUTGET task run by nfs4_proc_layoutget() */
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+       .rpc_call_prepare = nfs4_layoutget_prepare,
+       .rpc_call_done = nfs4_layoutget_done,
+       .rpc_release = nfs4_layoutget_release,
+};
+
+/*
+ * Issue a LAYOUTGET as an asynchronous RPC task, wait for it to finish
+ * and, on success, pass the reply to pnfs_layout_process().
+ *
+ * One page is allocated to receive the layout body.  If that allocation
+ * fails, lgp is released here and -ENOMEM is returned.  Returns 0 or a
+ * negative error.
+ */
+int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
+{
+       struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
+               .rpc_argp = &lgp->args,
+               .rpc_resp = &lgp->res,
+       };
+       struct rpc_task_setup task_setup_data = {
+               .rpc_client = server->client,
+               .rpc_message = &msg,
+               .callback_ops = &nfs4_layoutget_call_ops,
+               .callback_data = lgp,
+               .flags = RPC_TASK_ASYNC,
+       };
+       struct rpc_task *task;
+       int status;
+
+       dprintk("--> %s\n", __func__);
+
+       lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
+       if (!lgp->res.layout.buf) {
+               nfs4_layoutget_release(lgp);
+               return -ENOMEM;
+       }
+
+       lgp->res.seq_res.sr_slot = NULL;
+       task = rpc_run_task(&task_setup_data);
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+
+       /* each stage runs only if every earlier one returned 0 */
+       status = nfs4_wait_for_completion_rpc_task(task);
+       if (status == 0)
+               status = lgp->status;
+       if (status == 0)
+               status = pnfs_layout_process(lgp);
+
+       rpc_put_task(task);
+       dprintk("<-- %s status=%d\n", __func__, status);
+       return status;
+}
+
+/*
+ * Perform one synchronous GETDEVICEINFO call for 'pdev' and return its
+ * status.  Both the request arguments and the result point at the same
+ * pnfs_device.
+ */
+static int
+_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+       struct nfs4_getdeviceinfo_args gdi_args = {
+               .pdev = pdev,
+       };
+       struct nfs4_getdeviceinfo_res gdi_res = {
+               .pdev = pdev,
+       };
+       struct rpc_message gdi_msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+               .rpc_argp = &gdi_args,
+               .rpc_resp = &gdi_res,
+       };
+       int ret;
+
+       dprintk("--> %s\n", __func__);
+       ret = nfs4_call_sync(server, &gdi_msg, &gdi_args, &gdi_res, 0);
+       dprintk("<-- %s status=%d\n", __func__, ret);
+
+       return ret;
+}
+
+/*
+ * GETDEVICEINFO with standard NFSv4 exception handling: the call is
+ * repeated for as long as nfs4_handle_exception() sets exception.retry.
+ * Returns the value produced by the last nfs4_handle_exception() call.
+ */
+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+       struct nfs4_exception exception = { };
+       int err;
+
+       do {
+               err = _nfs4_proc_getdeviceinfo(server, pdev);
+               err = nfs4_handle_exception(server, err, &exception);
+       } while (exception.retry);
+       return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
index aa0b02a610c4d7c797691896c7e3b4c3eb35b682..f575a3126737054efa695db149c9c0f08571793d 100644 (file)
@@ -54,6 +54,7 @@
 #include "callback.h"
 #include "delegation.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #define OPENOWNER_POOL_SIZE    8
 
@@ -1475,6 +1476,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
                        }
                        clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
                        set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+                       pnfs_destroy_all_layouts(clp);
                }
 
                if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
index bd2101d918c80cda6d6f21b03f696e5312803676..f313c4cce7e4d1d087b8a1b47b55f769edf40a21 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/nfs_idmap.h>
 #include "nfs4_fs.h"
 #include "internal.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY                NFSDBG_XDR
 
@@ -310,6 +311,19 @@ static int nfs4_stat_to_errno(int);
                                XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz  (op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz  (op_decode_hdr_maxsz + 4)
+/*
+ * Upper bounds, in 4-byte XDR words, for the pNFS GETDEVICEINFO and
+ * LAYOUTGET operations.  The decode_* values are what the encoders add to
+ * hdr->replen.
+ */
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
+                               XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
+                               1 /* layout type */ + \
+                               1 /* opaque devaddr4 length */ + \
+                                 /* devaddr4 payload is read into page */ \
+                               1 /* notification bitmap length */ + \
+                               1 /* notification bitmap */)
+#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
+                               encode_stateid_maxsz)
+#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
+                               decode_stateid_maxsz + \
+                               XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz  0
 #define decode_sequence_maxsz  0
@@ -699,6 +713,20 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz   (compound_decode_hdr_maxsz + \
                                         decode_sequence_maxsz + \
                                         decode_reclaim_complete_maxsz)
+/*
+ * Whole-compound sizes for the pNFS calls: compound header + SEQUENCE
+ * (+ PUTFH for LAYOUTGET) + the operation itself.
+ */
+#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \
+                               encode_sequence_maxsz +\
+                               encode_getdeviceinfo_maxsz)
+#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz +    \
+                               decode_sequence_maxsz + \
+                               decode_getdeviceinfo_maxsz)
+#define NFS4_enc_layoutget_sz  (compound_encode_hdr_maxsz + \
+                               encode_sequence_maxsz + \
+                               encode_putfh_maxsz +        \
+                               encode_layoutget_maxsz)
+#define NFS4_dec_layoutget_sz  (compound_decode_hdr_maxsz + \
+                               decode_sequence_maxsz + \
+                               decode_putfh_maxsz +        \
+                               decode_layoutget_maxsz)
 
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                                      compound_encode_hdr_maxsz +
@@ -1737,6 +1765,58 @@ static void encode_sequence(struct xdr_stream *xdr,
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#ifdef CONFIG_NFS_V4_1
+/*
+ * Append a GETDEVICEINFO operation to the compound being built in 'xdr'.
+ *
+ * Words written, in order: opcode, the fixed-size device id, the layout
+ * type, the maximum reply size (pdev->pglen) and an empty notification
+ * bitmap.  Also counts the operation in 'hdr' and grows the expected
+ * reply length.
+ */
+static void
+encode_getdeviceinfo(struct xdr_stream *xdr,
+                    const struct nfs4_getdeviceinfo_args *args,
+                    struct compound_hdr *hdr)
+{
+       __be32 *p;
+
+       /* 16 bytes = opcode + layout type + maxcount + bitmap length */
+       p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE);
+       *p++ = cpu_to_be32(OP_GETDEVICEINFO);
+       p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
+                                   NFS4_DEVICEID4_SIZE);
+       *p++ = cpu_to_be32(args->pdev->layout_type);
+       *p++ = cpu_to_be32(args->pdev->pglen);          /* gdia_maxcount */
+       *p++ = cpu_to_be32(0);                          /* bitmap length 0 */
+       hdr->nops++;
+       hdr->replen += decode_getdeviceinfo_maxsz;
+}
+
+/*
+ * Append a LAYOUTGET operation to the compound being built in 'xdr'.
+ *
+ * Words written, in order: opcode, a zero "signal layout available" flag,
+ * layout type, iomode, offset, length, minlength, the stateid filled in
+ * by pnfs_get_layout_stateid(), and maxcount.  Also counts the operation
+ * in 'hdr' and grows the expected reply length.
+ */
+static void
+encode_layoutget(struct xdr_stream *xdr,
+                     const struct nfs4_layoutget_args *args,
+                     struct compound_hdr *hdr)
+{
+       nfs4_stateid stateid;
+       __be32 *p;
+
+       /* 44 bytes = five 4-byte words + three 8-byte hypers; stateid extra */
+       p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
+       *p++ = cpu_to_be32(OP_LAYOUTGET);
+       *p++ = cpu_to_be32(0);     /* Signal layout available */
+       *p++ = cpu_to_be32(args->type);
+       *p++ = cpu_to_be32(args->range.iomode);
+       p = xdr_encode_hyper(p, args->range.offset);
+       p = xdr_encode_hyper(p, args->range.length);
+       p = xdr_encode_hyper(p, args->minlength);
+       pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout,
+                               args->ctx->state);
+       p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE);
+       *p = cpu_to_be32(args->maxcount);
+
+       dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",
+               __func__,
+               args->type,
+               args->range.iomode,
+               (unsigned long)args->range.offset,
+               (unsigned long)args->range.length,
+               args->maxcount);
+       hdr->nops++;
+       hdr->replen += decode_layoutget_maxsz;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" ENCODE ROUTINES.
  */
@@ -2554,6 +2634,51 @@ static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p,
        return 0;
 }
 
+/*
+ * Encode GETDEVICEINFO request
+ *
+ * Builds the compound { SEQUENCE, GETDEVICEINFO } in the request's send
+ * buffer and points the receive buffer's page section at the caller's
+ * pages (args->pdev->pages).  Always returns 0.
+ */
+static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,
+                                     struct nfs4_getdeviceinfo_args *args)
+{
+       struct xdr_stream xdr;
+       struct compound_hdr hdr = {
+               .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+       };
+
+       xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+       encode_compound_hdr(&xdr, req, &hdr);
+       encode_sequence(&xdr, &args->seq_args, &hdr);
+       encode_getdeviceinfo(&xdr, args, &hdr);
+
+       /* set up reply kvec. Subtract notification bitmap max size (2)
+        * so that notification bitmap is put in xdr_buf tail */
+       /* hdr.replen counts 4-byte words; << 2 converts it to bytes */
+       xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2,
+                        args->pdev->pages, args->pdev->pgbase,
+                        args->pdev->pglen);
+
+       encode_nops(&hdr);
+       return 0;
+}
+
+/*
+ *  Encode LAYOUTGET request
+ *
+ *  Builds the compound { SEQUENCE, PUTFH, LAYOUTGET } in the request's
+ *  send buffer, using the file handle of args->inode.  Always returns 0.
+ */
+static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p,
+                                 struct nfs4_layoutget_args *args)
+{
+       struct xdr_stream xdr;
+       struct compound_hdr hdr = {
+               .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+       };
+
+       xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+       encode_compound_hdr(&xdr, req, &hdr);
+       encode_sequence(&xdr, &args->seq_args, &hdr);
+       encode_putfh(&xdr, NFS_FH(args->inode), &hdr);
+       encode_layoutget(&xdr, args, &hdr);
+       encode_nops(&hdr);
+       return 0;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -3978,6 +4103,61 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
        return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
 }
 
+/*
+ * Decode potentially multiple layout types. Currently we only support
+ * one layout driver per file system.
+ *
+ * Reads the array count and then the whole array, storing only the first
+ * entry in *layouttype (0 when the array is empty).  Returns 0 on success
+ * or -EIO when the reply is too short or the count is implausible.
+ */
+static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
+                                        uint32_t *layouttype)
+{
+       __be32 *p;
+       uint32_t num;
+
+       p = xdr_inline_decode(xdr, 4);
+       if (unlikely(!p))
+               goto out_overflow;
+       num = be32_to_cpup(p);
+
+       /* pNFS is not supported by the underlying file system */
+       if (num == 0) {
+               *layouttype = 0;
+               return 0;
+       }
+       if (num > 1)
+               printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
+                       "per filesystem not supported\n", __func__);
+
+       /*
+        * The count comes from the server.  Reject values whose byte size
+        * would wrap in 32 bits: a wrapped (possibly zero) length would let
+        * xdr_inline_decode() succeed without covering the word read below.
+        */
+       if (unlikely(num > UINT_MAX / 4))
+               goto out_overflow;
+
+       /* Decode and set first layout type, move xdr->p past unused types */
+       p = xdr_inline_decode(xdr, num * 4);
+       if (unlikely(!p))
+               goto out_overflow;
+       *layouttype = be32_to_cpup(p);
+       return 0;
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
+
+/*
+ * Decode the FS_LAYOUT_TYPES attribute, if the reply bitmap carries it.
+ *
+ * *layouttype is written on every path that does not return an error: it
+ * is 0 when the attribute is absent.  Any lower-numbered bit still set in
+ * bitmap[1] means the attributes are being decoded out of order and
+ * yields -EIO.  The attribute's bit is cleared once it has been handled.
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+                               uint32_t *layouttype)
+{
+       int status;
+
+       dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+       if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+               return -EIO;
+
+       if (!(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) {
+               *layouttype = 0;
+               return 0;
+       }
+
+       status = decode_first_pnfs_layout_type(xdr, layouttype);
+       bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+       return status;
+}
+
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
        __be32 *savep;
@@ -4004,6 +4184,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
                goto xdr_error;
        fsinfo->wtpref = fsinfo->wtmax;
        status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
+       if (status != 0)
+               goto xdr_error;
+       status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
        if (status != 0)
                goto xdr_error;
 
@@ -4772,6 +4955,134 @@ out_overflow:
 #endif /* CONFIG_NFS_V4_1 */
 }
 
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Decode a GETDEVICEINFO reply.
+ *
+ * On -ETOOSMALL the server's minimum count is stored in pdev->mincount
+ * and the error is returned.  On success the layout type must match the
+ * one requested; the opaque device address is left in the reply pages
+ * (its length is stored in pdev->mincount), and the notification bitmap
+ * that follows must be all zero.
+ *
+ * Returns 0, the operation's error status, -EINVAL on a layout type
+ * mismatch, or -EIO on a short or unsupported reply.
+ */
+static int decode_getdeviceinfo(struct xdr_stream *xdr,
+                               struct pnfs_device *pdev)
+{
+       __be32 *p;
+       uint32_t len, type;
+       int status;
+
+       status = decode_op_hdr(xdr, OP_GETDEVICEINFO);
+       if (status) {
+               if (status == -ETOOSMALL) {
+                       p = xdr_inline_decode(xdr, 4);
+                       if (unlikely(!p))
+                               goto out_overflow;
+                       pdev->mincount = be32_to_cpup(p);
+                       dprintk("%s: Min count too small. mincnt = %u\n",
+                               __func__, pdev->mincount);
+               }
+               return status;
+       }
+
+       p = xdr_inline_decode(xdr, 8);
+       if (unlikely(!p))
+               goto out_overflow;
+       type = be32_to_cpup(p++);
+       if (type != pdev->layout_type) {
+               dprintk("%s: layout mismatch req: %u pdev: %u\n",
+                       __func__, pdev->layout_type, type);
+               return -EINVAL;
+       }
+       /*
+        * Get the length of the opaque device_addr4. xdr_read_pages places
+        * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+        * and places the remaining xdr data in xdr_buf->tail
+        */
+       pdev->mincount = be32_to_cpup(p);
+       xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+
+       /* Parse notification bitmap, verifying that it is zero. */
+       p = xdr_inline_decode(xdr, 4);
+       if (unlikely(!p))
+               goto out_overflow;
+       len = be32_to_cpup(p);
+       if (len) {
+               uint32_t i;
+
+               /*
+                * len comes from the server.  If 4 * len wrapped in 32 bits,
+                * xdr_inline_decode() would reserve too little and the loop
+                * below would read words it never checked.
+                */
+               if (unlikely(len > UINT_MAX / 4))
+                       goto out_overflow;
+               p = xdr_inline_decode(xdr, 4 * len);
+               if (unlikely(!p))
+                       goto out_overflow;
+               for (i = 0; i < len; i++, p++) {
+                       if (be32_to_cpup(p)) {
+                               dprintk("%s: notifications not supported\n",
+                                       __func__);
+                               return -EIO;
+                       }
+               }
+       }
+       return 0;
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
+
+/*
+ * Decode a LAYOUTGET reply into 'res'.
+ *
+ * Reads return_on_close, the stateid and the layout array count, then the
+ * first layout's offset, length, iomode and type, and copies its opaque
+ * body into res->layout.buf.  Array entries after the first are ignored.
+ *
+ * Returns 0, the operation's error status, -EINVAL for an empty layout
+ * array, -ENOMEM when the body is larger than a page, or -EIO on a short
+ * reply.
+ */
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+                           struct nfs4_layoutget_res *res)
+{
+       __be32 *p;
+       int status;
+       u32 layout_count;
+
+       status = decode_op_hdr(xdr, OP_LAYOUTGET);
+       if (status)
+               return status;
+       /* 8 bytes = return_on_close + layout array count */
+       p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+       if (unlikely(!p))
+               goto out_overflow;
+       res->return_on_close = be32_to_cpup(p++);
+       p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);
+       layout_count = be32_to_cpup(p);
+       if (!layout_count) {
+               dprintk("%s: server responded with empty layout array\n",
+                       __func__);
+               return -EINVAL;
+       }
+
+       /* 24 bytes = offset (8) + length (8) + iomode (4) + type (4) */
+       p = xdr_inline_decode(xdr, 24);
+       if (unlikely(!p))
+               goto out_overflow;
+       p = xdr_decode_hyper(p, &res->range.offset);
+       p = xdr_decode_hyper(p, &res->range.length);
+       res->range.iomode = be32_to_cpup(p++);
+       res->type = be32_to_cpup(p++);
+
+       /* p is reused here to receive the start of the opaque layout body */
+       status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
+       if (unlikely(status))
+               return status;
+
+       dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+               __func__,
+               (unsigned long)res->range.offset,
+               (unsigned long)res->range.length,
+               res->range.iomode,
+               res->type,
+               res->layout.len);
+
+       /* nfs4_proc_layoutget allocated a single page */
+       if (res->layout.len > PAGE_SIZE)
+               return -ENOMEM;
+       memcpy(res->layout.buf, p, res->layout.len);
+
+       if (layout_count > 1) {
+               /* We only handle a length one array at the moment.  Any
+                * further entries are just ignored.  Note that this means
+                * the client may see a response that is less than the
+                * minimum it requested.
+                */
+               dprintk("%s: server responded with %d layouts, dropping tail\n",
+                       __func__, layout_count);
+       }
+
+       return 0;
+out_overflow:
+       print_overflow_msg(__func__, xdr);
+       return -EIO;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 /*
  * END OF "GENERIC" DECODE ROUTINES.
  */
@@ -5799,6 +6110,53 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p,
                status = decode_reclaim_complete(&xdr, (void *)NULL);
        return status;
 }
+
+/*
+ * Decode GETDEVINFO response
+ *
+ * Walks the reply compound in order - header, SEQUENCE, GETDEVICEINFO -
+ * and stops at the first stage that reports a non-zero status, which is
+ * then returned.
+ */
+static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
+                                     struct nfs4_getdeviceinfo_res *res)
+{
+       struct xdr_stream xdr;
+       struct compound_hdr hdr;
+       int status;
+
+       xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+       status = decode_compound_hdr(&xdr, &hdr);
+       if (status == 0)
+               status = decode_sequence(&xdr, &res->seq_res, rqstp);
+       if (status == 0)
+               status = decode_getdeviceinfo(&xdr, res->pdev);
+       return status;
+}
+
+/*
+ * Decode LAYOUTGET response
+ *
+ * Walks the reply compound in order - header, SEQUENCE, PUTFH, LAYOUTGET -
+ * and stops at the first stage that reports a non-zero status, which is
+ * then returned.
+ */
+static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
+                                 struct nfs4_layoutget_res *res)
+{
+       struct xdr_stream xdr;
+       struct compound_hdr hdr;
+       int status;
+
+       xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+       status = decode_compound_hdr(&xdr, &hdr);
+       if (!status)
+               status = decode_sequence(&xdr, &res->seq_res, rqstp);
+       if (!status)
+               status = decode_putfh(&xdr);
+       if (!status)
+               status = decode_layoutget(&xdr, rqstp, res);
+       return status;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
@@ -5990,6 +6348,8 @@ struct rpc_procinfo       nfs4_procedures[] = {
   PROC(SEQUENCE,       enc_sequence,   dec_sequence),
   PROC(GET_LEASE_TIME, enc_get_lease_time,     dec_get_lease_time),
   PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete),
+  PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+  PROC(LAYOUTGET,  enc_layoutget,     dec_layoutget),
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
new file mode 100644 (file)
index 0000000..db77342
--- /dev/null
@@ -0,0 +1,783 @@
+/*
+ *  pNFS functions to call and manage layout drivers.
+ *
+ *  Copyright (c) 2002 [year of first publication]
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#include <linux/nfs_fs.h>
+#include "internal.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY                NFSDBG_PNFS
+
+/* Locking:
+ *
+ * pnfs_spinlock:
+ *      protects pnfs_modules_tbl.
+ */
+static DEFINE_SPINLOCK(pnfs_spinlock);
+
+/*
+ * pnfs_modules_tbl holds all pnfs modules
+ */
+static LIST_HEAD(pnfs_modules_tbl);
+
+/*
+ * Look up a registered pnfs layout driver by layout type id.
+ * Walks pnfs_modules_tbl, which is protected by pnfs_spinlock; the
+ * callers in this file hold that lock around the call.
+ * Returns the matching driver, or NULL when none is registered for @id.
+ */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver_locked(u32 id)
+{
+       struct pnfs_layoutdriver_type *ld, *match = NULL;
+
+       list_for_each_entry(ld, &pnfs_modules_tbl, pnfs_tblid) {
+               if (ld->id == id) {
+                       match = ld;
+                       break;
+               }
+       }
+       dprintk("%s: Searching for id %u, found %p\n", __func__, id, match);
+       return match;
+}
+
+/* Search pnfs_modules_tbl for @id, taking pnfs_spinlock around the lookup. */
+static struct pnfs_layoutdriver_type *
+find_pnfs_driver(u32 id)
+{
+       struct pnfs_layoutdriver_type *ld;
+
+       spin_lock(&pnfs_spinlock);
+       ld = find_pnfs_driver_locked(id);
+       spin_unlock(&pnfs_spinlock);
+
+       return ld;
+}
+
+/*
+ * Detach the current layout driver (if any) from @nfss: call the driver's
+ * clear_layoutdriver hook, drop the module reference, and clear the pointer.
+ */
+void
+unset_pnfs_layoutdriver(struct nfs_server *nfss)
+{
+       /* Nothing attached: the pointer is already NULL, nothing to undo */
+       if (!nfss->pnfs_curr_ld)
+               return;
+
+       nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+       module_put(nfss->pnfs_curr_ld->owner);
+       nfss->pnfs_curr_ld = NULL;
+}
+
+/*
+ * Try to set the server's pnfs module to the pnfs layout type specified by id.
+ * Currently only one pNFS layout driver per filesystem is supported.
+ *
+ * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ */
+void
+set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
+{
+       struct pnfs_layoutdriver_type *ld_type = NULL;
+
+       if (id == 0)
+               goto out_no_driver;
+       /* Require at least one of these EXCHANGE_ID flags on the client */
+       if (!(server->nfs_client->cl_exchange_flags &
+                (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
+               printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
+                      id, server->nfs_client->cl_exchange_flags);
+               goto out_no_driver;
+       }
+       ld_type = find_pnfs_driver(id);
+       if (!ld_type) {
+               /* Not registered: try to load "<prefix>-<id>" and look again */
+               request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+               ld_type = find_pnfs_driver(id);
+               if (!ld_type) {
+                       dprintk("%s: No pNFS module found for %u.\n",
+                               __func__, id);
+                       goto out_no_driver;
+               }
+       }
+       /*
+        * NOTE(review): find_pnfs_driver() has already dropped pnfs_spinlock
+        * when the module reference is taken here - confirm that a concurrent
+        * pnfs_unregister_layoutdriver() cannot hit this window.
+        */
+       if (!try_module_get(ld_type->owner)) {
+               dprintk("%s: Could not grab reference on module\n", __func__);
+               goto out_no_driver;
+       }
+       /* Published before the hook runs; reset to NULL below on failure */
+       server->pnfs_curr_ld = ld_type;
+       if (ld_type->set_layoutdriver(server)) {
+               printk(KERN_ERR
+                      "%s: Error initializing mount point for layout driver %u.\n",
+                      __func__, id);
+               /* Undo the try_module_get() above */
+               module_put(ld_type->owner);
+               goto out_no_driver;
+       }
+       dprintk("%s: pNFS module for %u set\n", __func__, id);
+       return;
+
+out_no_driver:
+       /* Every failure path ends with no layout driver attached */
+       dprintk("%s: Using NFSv4 I/O\n", __func__);
+       server->pnfs_curr_ld = NULL;
+}
+
+/*
+ * Register a layout driver so that set_pnfs_layoutdriver() can find it.
+ *
+ * @ld_type: driver description; id must be non-zero and both alloc_lseg
+ *           and free_lseg must be provided.
+ *
+ * Returns 0 on success, or -EINVAL if the description is invalid or a
+ * driver with the same id is already registered.
+ */
+int
+pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+       int status = -EINVAL;
+       struct pnfs_layoutdriver_type *tmp;
+
+       if (ld_type->id == 0) {
+               printk(KERN_ERR "%s id 0 is reserved\n", __func__);
+               return status;
+       }
+       if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
+               printk(KERN_ERR "%s Layout driver must provide "
+                      "alloc_lseg and free_lseg.\n", __func__);
+               return status;
+       }
+
+       /* Lookup and insertion happen under one lock hold, so ids stay unique */
+       spin_lock(&pnfs_spinlock);
+       tmp = find_pnfs_driver_locked(ld_type->id);
+       if (!tmp) {
+               list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
+               status = 0;
+               dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
+                       ld_type->name);
+       } else {
+               /* id is a u32: print it unsigned, as the other messages do */
+               printk(KERN_ERR "%s Module with id %u already loaded!\n",
+                       __func__, ld_type->id);
+       }
+       spin_unlock(&pnfs_spinlock);
+
+       return status;
+}
+EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
+
+/*
+ * Remove @ld_type from pnfs_modules_tbl under pnfs_spinlock.
+ * No check is made here that the driver is currently on the list.
+ */
+void
+pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
+{
+       dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
+       spin_lock(&pnfs_spinlock);
+       list_del(&ld_type->pnfs_tblid);
+       spin_unlock(&pnfs_spinlock);
+}
+EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
+
+/*
+ * pNFS client layout cache
+ */
+
+/* Take a reference on @lo. The inode's i_lock must be held (asserted). */
+static void
+get_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+       assert_spin_locked(&lo->inode->i_lock);
+       lo->refcount++;
+}
+
+/*
+ * Drop a reference on @lo. The inode's i_lock must be held (asserted).
+ * When the count reaches zero the header is detached from the nfs_inode
+ * and freed, so @lo must not be touched after this call.
+ */
+static void
+put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
+{
+       assert_spin_locked(&lo->inode->i_lock);
+       /* Underflow means an unbalanced put somewhere */
+       BUG_ON(lo->refcount == 0);
+
+       lo->refcount--;
+       if (!lo->refcount) {
+               dprintk("%s: freeing layout cache %p\n", __func__, lo);
+               /* Must already be off the client's list of layouts */
+               BUG_ON(!list_empty(&lo->layouts));
+               NFS_I(lo->inode)->layout = NULL;
+               kfree(lo);
+       }
+}
+
+/*
+ * Drop a reference on the layout header attached to @inode, taking i_lock
+ * around the put. NFS_I(inode)->layout is passed on without a NULL check.
+ */
+void
+put_layout_hdr(struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+       put_layout_hdr_locked(NFS_I(inode)->layout);
+       spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Initialise the generic part of a layout segment: empty list linkage,
+ * a freshly initialised kref, and the back pointer to its header @lo.
+ */
+static void
+init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+{
+       INIT_LIST_HEAD(&lseg->fi_list);
+       kref_init(&lseg->kref);
+       lseg->layout = lo;
+}
+
+/*
+ * kref release callback used by put_lseg().
+ * Called without i_lock held, as the free_lseg call may sleep.
+ */
+static void
+destroy_lseg(struct kref *kref)
+{
+       struct pnfs_layout_segment *lseg =
+               container_of(kref, struct pnfs_layout_segment, kref);
+       /* Read the inode first: lseg is handed to the driver's free_lseg below */
+       struct inode *ino = lseg->layout->inode;
+
+       dprintk("--> %s\n", __func__);
+       NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+       /* Matched by get_layout_hdr_locked in pnfs_insert_layout */
+       put_layout_hdr(ino);
+}
+
+/*
+ * Drop a reference on @lseg; destroy_lseg() runs when the last one goes.
+ * A NULL @lseg is accepted and ignored.
+ */
+static void
+put_lseg(struct pnfs_layout_segment *lseg)
+{
+       if (lseg != NULL) {
+               dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+                       atomic_read(&lseg->kref.refcount));
+               kref_put(&lseg->kref, destroy_lseg);
+       }
+}
+
+/*
+ * Empty the segment list of @lo. The inode's i_lock must be held (asserted).
+ * Segments are only moved onto @tmp_list here; the caller releases them
+ * later with pnfs_free_lseg_list(). Also unlinks @lo from the list it is
+ * on via lo->layouts and clears NFS_LAYOUT_STATEID_SET.
+ */
+static void
+pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list)
+{
+       struct pnfs_layout_segment *lseg, *next;
+       struct nfs_client *clp;
+
+       dprintk("%s:Begin lo %p\n", __func__, lo);
+
+       assert_spin_locked(&lo->inode->i_lock);
+       list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) {
+               dprintk("%s: freeing lseg %p\n", __func__, lseg);
+               list_move(&lseg->fi_list, tmp_list);
+       }
+       /* cl_lock nests inside i_lock here */
+       clp = NFS_SERVER(lo->inode)->nfs_client;
+       spin_lock(&clp->cl_lock);
+       /* List does not take a reference, so no need for put here */
+       list_del_init(&lo->layouts);
+       spin_unlock(&clp->cl_lock);
+       /* The stateid is protected by lo->seqlock; mark it as no longer set */
+       write_seqlock(&lo->seqlock);
+       clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+       write_sequnlock(&lo->seqlock);
+
+       dprintk("%s:Return\n", __func__);
+}
+
+/*
+ * Release every segment on @tmp_list: unlink it and drop one reference
+ * with put_lseg(). The list is empty on return.
+ */
+static void
+pnfs_free_lseg_list(struct list_head *tmp_list)
+{
+       struct pnfs_layout_segment *lseg;
+
+       while (!list_empty(tmp_list)) {
+               lseg = list_entry(tmp_list->next, struct pnfs_layout_segment,
+                               fi_list);
+               dprintk("%s calling put_lseg on %p\n", __func__, lseg);
+               list_del(&lseg->fi_list);
+               put_lseg(lseg);
+       }
+}
+
+/*
+ * Tear down the layout attached to @nfsi, if there is one.
+ * Segments are collected under i_lock and released after it is dropped,
+ * since releasing a segment may sleep (see destroy_lseg).
+ */
+void
+pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+       struct pnfs_layout_hdr *lo;
+       LIST_HEAD(tmp_list);
+
+       spin_lock(&nfsi->vfs_inode.i_lock);
+       lo = nfsi->layout;
+       if (lo) {
+               pnfs_clear_lseg_list(lo, &tmp_list);
+               /* Matched by refcount set to 1 in alloc_init_layout_hdr */
+               put_layout_hdr_locked(lo);
+       }
+       spin_unlock(&nfsi->vfs_inode.i_lock);
+       /* Outside the spinlock: put_lseg() may end up in free_lseg */
+       pnfs_free_lseg_list(&tmp_list);
+}
+
+/*
+ * Called by the state manager to remove all layouts established under an
+ * expired lease.
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+       struct pnfs_layout_hdr *lo;
+       LIST_HEAD(tmp_list);
+
+       /* Steal the whole list so it can be walked without holding cl_lock */
+       spin_lock(&clp->cl_lock);
+       list_splice_init(&clp->cl_layouts, &tmp_list);
+       spin_unlock(&clp->cl_lock);
+
+       /*
+        * Each pass relies on pnfs_destroy_layout() -> pnfs_clear_lseg_list()
+        * unlinking lo->layouts, which is what makes tmp_list shrink.
+        * NOTE(review): that only happens while NFS_I(lo->inode)->layout is
+        * still set - confirm it cannot be cleared concurrently.
+        */
+       while (!list_empty(&tmp_list)) {
+               lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
+                               layouts);
+               dprintk("%s freeing layout for inode %lu\n", __func__,
+                       lo->inode->i_ino);
+               pnfs_destroy_layout(NFS_I(lo->inode));
+       }
+}
+
+/* Update lo->stateid with new if it is more recent.
+ *
+ * lo->stateid could be the open stateid, in which case we just use what is
+ * given.
+ */
+static void
+pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
+                       const nfs4_stateid *new)
+{
+       nfs4_stateid *old = &lo->stateid;
+       bool overwrite = false;
+
+       write_seqlock(&lo->seqlock);
+       /* No stateid recorded yet, or a different "other" field: take new */
+       if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) ||
+           memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other)))
+               overwrite = true;
+       else {
+               u32 oldseq, newseq;
+
+               oldseq = be32_to_cpu(old->stateid.seqid);
+               newseq = be32_to_cpu(new->stateid.seqid);
+               /* Signed difference: "newer" test that tolerates seqid wrap */
+               if ((int)(newseq - oldseq) > 0)
+                       overwrite = true;
+       }
+       if (overwrite)
+               memcpy(&old->stateid, &new->stateid, sizeof(new->stateid));
+       write_sequnlock(&lo->seqlock);
+}
+
+/*
+ * Seed lo->stateid with a copy of the open stateid held in @state and mark
+ * it valid by setting NFS_LAYOUT_STATEID_SET.
+ */
+static void
+pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo,
+                             struct nfs4_state *state)
+{
+       int seq;
+
+       dprintk("--> %s\n", __func__);
+       write_seqlock(&lo->seqlock);
+       /* Repeat the copy until state->seqlock reports a consistent read */
+       do {
+               seq = read_seqbegin(&state->seqlock);
+               memcpy(lo->stateid.data, state->stateid.data,
+                      sizeof(state->stateid.data));
+       } while (read_seqretry(&state->seqlock, seq));
+       set_bit(NFS_LAYOUT_STATEID_SET, &lo->state);
+       write_sequnlock(&lo->seqlock);
+       dprintk("<-- %s\n", __func__);
+}
+
+/*
+ * Copy the current layout stateid of @lo into @dst.
+ * If no layout stateid has been set yet, lo->stateid is first seeded from
+ * @open_state; the read is then retried and picks up the seeded value.
+ */
+void
+pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+                       struct nfs4_state *open_state)
+{
+       int seq;
+
+       dprintk("--> %s\n", __func__);
+       do {
+               seq = read_seqbegin(&lo->seqlock);
+               if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) {
+                       /* This will trigger retry of the read */
+                       pnfs_layout_from_open_stateid(lo, open_state);
+               } else
+                       memcpy(dst->data, lo->stateid.data,
+                              sizeof(lo->stateid.data));
+       } while (read_seqretry(&lo->seqlock, seq));
+       dprintk("<-- %s\n", __func__);
+}
+
+/*
+* Get layout from server.
+*    for now, assume that whole file layouts are requested.
+*    arg->offset: 0
+*    arg->length: all ones
+*/
+static struct pnfs_layout_segment *
+send_layoutget(struct pnfs_layout_hdr *lo,
+          struct nfs_open_context *ctx,
+          u32 iomode)
+{
+       struct inode *ino = lo->inode;
+       struct nfs_server *server = NFS_SERVER(ino);
+       struct nfs4_layoutget *lgp;
+       struct pnfs_layout_segment *lseg = NULL;
+
+       dprintk("--> %s\n", __func__);
+
+       BUG_ON(ctx == NULL);
+       lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
+       if (lgp == NULL) {
+               /*
+                * Presumably drops the header reference taken by the caller
+                * (pnfs_update_layout), since no request will be issued to
+                * release it - confirm against nfs4_layoutget_release.
+                */
+               put_layout_hdr(lo->inode);
+               return NULL;
+       }
+       /* Whole-file request: offset 0, length and minlength all ones */
+       lgp->args.minlength = NFS4_MAX_UINT64;
+       lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+       lgp->args.range.iomode = iomode;
+       lgp->args.range.offset = 0;
+       lgp->args.range.length = NFS4_MAX_UINT64;
+       lgp->args.type = server->pnfs_curr_ld->id;
+       lgp->args.inode = ino;
+       lgp->args.ctx = get_nfs_open_context(ctx);
+       /* pnfs_layout_process() stores the new segment through this pointer */
+       lgp->lsegpp = &lseg;
+
+       /* Synchronously retrieve layout information from server and
+        * store in lseg.
+        */
+       nfs4_proc_layoutget(lgp);
+       if (!lseg) {
+               /* remember that LAYOUTGET failed and suspend trying */
+               set_bit(lo_fail_bit(iomode), &lo->state);
+       }
+       return lseg;
+}
+
+/*
+ * Compare two layout segments for sorting into layout cache.
+ * We want to preferentially return RW over RO layouts, so ensure those
+ * are seen first.
+ */
+static s64
+cmp_layout(u32 iomode1, u32 iomode2)
+{
+       /*
+        * read > read/write: the result is positive when only iomode2 is
+        * IOMODE_READ, negative when only iomode1 is, and zero otherwise.
+        */
+       int first_is_read = (iomode1 == IOMODE_READ);
+       int second_is_read = (iomode2 == IOMODE_READ);
+
+       return second_is_read - first_is_read;
+}
+
+/*
+ * Link @lseg into lo->segs at the position chosen by cmp_layout() and take
+ * a reference on @lo for it. The inode's i_lock must be held (asserted).
+ * When the first segment is added, @lo is also put on the client's
+ * cl_layouts list.
+ */
+static void
+pnfs_insert_layout(struct pnfs_layout_hdr *lo,
+                  struct pnfs_layout_segment *lseg)
+{
+       struct pnfs_layout_segment *lp;
+       int found = 0;
+
+       dprintk("%s:Begin\n", __func__);
+
+       assert_spin_locked(&lo->inode->i_lock);
+       if (list_empty(&lo->segs)) {
+               struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client;
+
+               /* cl_lock nests inside i_lock here */
+               spin_lock(&clp->cl_lock);
+               BUG_ON(!list_empty(&lo->layouts));
+               list_add_tail(&lo->layouts, &clp->cl_layouts);
+               spin_unlock(&clp->cl_lock);
+       }
+       list_for_each_entry(lp, &lo->segs, fi_list) {
+               if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0)
+                       continue;
+               /* list_add_tail() on lp's node inserts lseg just before lp */
+               list_add_tail(&lseg->fi_list, &lp->fi_list);
+               dprintk("%s: inserted lseg %p "
+                       "iomode %d offset %llu length %llu before "
+                       "lp %p iomode %d offset %llu length %llu\n",
+                       __func__, lseg, lseg->range.iomode,
+                       lseg->range.offset, lseg->range.length,
+                       lp, lp->range.iomode, lp->range.offset,
+                       lp->range.length);
+               found = 1;
+               break;
+       }
+       if (!found) {
+               list_add_tail(&lseg->fi_list, &lo->segs);
+               dprintk("%s: inserted lseg %p "
+                       "iomode %d offset %llu length %llu at tail\n",
+                       __func__, lseg, lseg->range.iomode,
+                       lseg->range.offset, lseg->range.length);
+       }
+       /* Dropped again by put_layout_hdr() in destroy_lseg() */
+       get_layout_hdr_locked(lo);
+
+       dprintk("%s:Return\n", __func__);
+}
+
+/*
+ * Allocate a zeroed layout header for @ino with an initial refcount of 1,
+ * empty lists and an initialised seqlock. Returns NULL if the allocation
+ * fails.
+ */
+static struct pnfs_layout_hdr *
+alloc_init_layout_hdr(struct inode *ino)
+{
+       struct pnfs_layout_hdr *hdr = kzalloc(sizeof(*hdr), GFP_KERNEL);
+
+       if (hdr == NULL)
+               return NULL;
+
+       hdr->inode = ino;
+       hdr->refcount = 1;
+       seqlock_init(&hdr->seqlock);
+       INIT_LIST_HEAD(&hdr->segs);
+       INIT_LIST_HEAD(&hdr->layouts);
+       return hdr;
+}
+
+/*
+ * Return the layout header of @ino, allocating and attaching one if none
+ * exists yet. Called with i_lock held (asserted) and returns with it held,
+ * but the lock is dropped around the allocation. Returns NULL only if the
+ * allocation failed and no other thread attached a header meanwhile.
+ */
+static struct pnfs_layout_hdr *
+pnfs_find_alloc_layout(struct inode *ino)
+{
+       struct nfs_inode *nfsi = NFS_I(ino);
+       struct pnfs_layout_hdr *new = NULL;
+
+       dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
+
+       assert_spin_locked(&ino->i_lock);
+       if (nfsi->layout)
+               return nfsi->layout;
+
+       /* GFP_KERNEL allocation may sleep, so it cannot run under i_lock */
+       spin_unlock(&ino->i_lock);
+       new = alloc_init_layout_hdr(ino);
+       spin_lock(&ino->i_lock);
+
+       /* Re-check: another thread may have attached a header meanwhile */
+       if (likely(nfsi->layout == NULL))       /* Won the race? */
+               nfsi->layout = new;
+       else
+               kfree(new);
+       return nfsi->layout;
+}
+
+/*
+ * iomode matching rules:
+ * iomode      lseg    match
+ * -----       -----   -----
+ * ANY         READ    true
+ * ANY         RW      true
+ * RW          READ    false
+ * RW          RW      true
+ * READ                READ    true
+ * READ                RW      true
+ */
+static int
+is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+{
+       /* Only an RW request is picky: it needs an RW segment */
+       if (iomode != IOMODE_RW)
+               return 1;
+       return lseg->range.iomode == IOMODE_RW;
+}
+
+/*
+ * lookup range in layout
+ */
+/*
+ * Return the first cached segment of @lo that satisfies @iomode, or NULL.
+ * The inode's i_lock must be held (asserted). No reference is taken on the
+ * returned segment here.
+ */
+static struct pnfs_layout_segment *
+pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)
+{
+       struct pnfs_layout_segment *pos, *found = NULL;
+
+       dprintk("%s:Begin\n", __func__);
+
+       assert_spin_locked(&lo->inode->i_lock);
+       list_for_each_entry(pos, &lo->segs, fi_list) {
+               if (!is_matching_lseg(pos, iomode)) {
+                       /* Stop early once cmp_layout() says to give up */
+                       if (cmp_layout(iomode, pos->range.iomode) > 0)
+                               break;
+                       continue;
+               }
+               found = pos;
+               break;
+       }
+
+       dprintk("%s:Return lseg %p ref %d\n",
+               __func__, found, found ? atomic_read(&found->kref.refcount) : 0);
+       return found;
+}
+
+/*
+ * Layout segment is retrieved from the server if not cached.
+ *
+ * @ino:    inode whose layout is wanted
+ * @ctx:    open context used for the LAYOUTGET request
+ * @iomode: requested I/O mode
+ *
+ * Returns a matching segment, or NULL if pNFS is not enabled for this
+ * server, the layout header could not be allocated, an earlier LAYOUTGET
+ * for this iomode already failed, or the LAYOUTGET issued here did not
+ * produce a segment.
+ *
+ * NOTE(review): pnfs_has_layout() takes no reference on a cached segment,
+ * so nothing visible here pins the returned segment for the caller -
+ * confirm before a caller starts using the return value.
+ */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino,
+                  struct nfs_open_context *ctx,
+                  enum pnfs_iomode iomode)
+{
+       struct nfs_inode *nfsi = NFS_I(ino);
+       struct pnfs_layout_hdr *lo;
+       struct pnfs_layout_segment *lseg = NULL;
+
+       if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+               return NULL;
+       spin_lock(&ino->i_lock);
+       lo = pnfs_find_alloc_layout(ino);
+       if (lo == NULL) {
+               dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
+               goto out_unlock;
+       }
+
+       /* Check to see if the layout for the given range already exists */
+       lseg = pnfs_has_layout(lo, iomode);
+       if (lseg) {
+               dprintk("%s: Using cached lseg %p for iomode %d)\n",
+                       __func__, lseg, iomode);
+               goto out_unlock;
+       }
+
+       /* if LAYOUTGET already failed once we don't try again */
+       if (test_bit(lo_fail_bit(iomode), &lo->state))
+               goto out_unlock;
+
+       get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */
+       spin_unlock(&ino->i_lock);
+
+       lseg = send_layoutget(lo, ctx, iomode);
+out:
+       /*
+        * nfsi->layout is NULL when the header allocation failed above, so
+        * it must not be dereferenced unconditionally here.
+        */
+       dprintk("%s end, state 0x%lx lseg %p\n", __func__,
+               nfsi->layout ? nfsi->layout->state : 0, lseg);
+       return lseg;
+out_unlock:
+       spin_unlock(&ino->i_lock);
+       goto out;
+}
+
+/*
+ * Turn a LAYOUTGET result into a cached layout segment.
+ * The layout driver's alloc_lseg builds the segment from lgp->res; it is
+ * then initialised, handed back through lgp->lsegpp, inserted into the
+ * inode's layout, and the layout stateid is updated from the result.
+ *
+ * Returns 0 on success, -ENOMEM if alloc_lseg returned NULL, or the error
+ * encoded in the pointer alloc_lseg returned.
+ */
+int
+pnfs_layout_process(struct nfs4_layoutget *lgp)
+{
+       struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
+       struct nfs4_layoutget_res *res = &lgp->res;
+       struct pnfs_layout_segment *lseg;
+       struct inode *ino = lo->inode;
+       int status = 0;
+
+       /* Inject layout blob into I/O device driver */
+       lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
+       if (!lseg || IS_ERR(lseg)) {
+               if (!lseg)
+                       status = -ENOMEM;
+               else
+                       status = PTR_ERR(lseg);
+               dprintk("%s: Could not allocate layout: error %d\n",
+                      __func__, status);
+               goto out;
+       }
+
+       spin_lock(&ino->i_lock);
+       init_lseg(lo, lseg);
+       lseg->range = res->range;
+       /* Hand the segment back to whoever set lsegpp (see send_layoutget) */
+       *lgp->lsegpp = lseg;
+       pnfs_insert_layout(lo, lseg);
+
+       /* Done processing layoutget. Set the layout stateid */
+       pnfs_set_layout_stateid(lo, &res->stateid);
+       spin_unlock(&ino->i_lock);
+out:
+       return status;
+}
+
+/*
+ * Device ID cache. Currently supports one layout type per struct nfs_client.
+ * Add layout type to the lookup key to expand to support multiple types.
+ */
+/*
+ * Attach a device ID cache to @clp, or take another reference on the one
+ * it already has. @free_callback is recorded only when a new cache is
+ * installed. Returns 0 on success or -ENOMEM.
+ */
+int
+pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
+                        void (*free_callback)(struct pnfs_deviceid_node *))
+{
+       struct pnfs_deviceid_cache *c;
+
+       /* Allocate before taking cl_lock: GFP_KERNEL may sleep */
+       c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
+       if (!c)
+               return -ENOMEM;
+       spin_lock(&clp->cl_lock);
+       if (clp->cl_devid_cache != NULL) {
+               /* Cache already present: share it and discard our allocation */
+               atomic_inc(&clp->cl_devid_cache->dc_ref);
+               dprintk("%s [kref [%d]]\n", __func__,
+                       atomic_read(&clp->cl_devid_cache->dc_ref));
+               kfree(c);
+       } else {
+               /* kzalloc initializes hlists */
+               spin_lock_init(&c->dc_lock);
+               atomic_set(&c->dc_ref, 1);
+               c->dc_free_callback = free_callback;
+               clp->cl_devid_cache = c;
+               dprintk("%s [new]\n", __func__);
+       }
+       spin_unlock(&clp->cl_lock);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
+
+/*
+ * Called from pnfs_layoutdriver_type->free_lseg
+ * last layout segment reference frees deviceid
+ */
+void
+pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+                 struct pnfs_deviceid_node *devid)
+{
+       struct nfs4_deviceid *id = &devid->de_id;
+       struct pnfs_deviceid_node *d;
+       struct hlist_node *n;
+       long h = nfs4_deviceid_hash(id);
+
+       dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
+       /* dc_lock is taken only when this put drops the last reference */
+       if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
+               return;
+
+       hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
+               if (!memcmp(&d->de_id, id, sizeof(*id))) {
+                       hlist_del_rcu(&d->de_node);
+                       spin_unlock(&c->dc_lock);
+                       /* Wait for RCU readers before the node is freed */
+                       synchronize_rcu();
+                       c->dc_free_callback(devid);
+                       return;
+               }
+       spin_unlock(&c->dc_lock);
+       /* Why wasn't it found in  the list? */
+       BUG();
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
+
+/* Find and reference a deviceid */
+/*
+ * Look @id up in cache @c under rcu_read_lock() and take a reference on
+ * the node. Returns NULL when there is no entry with that id, or when the
+ * first matching entry's reference count has already dropped to zero.
+ */
+struct pnfs_deviceid_node *
+pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
+{
+       struct pnfs_deviceid_node *d, *found = NULL;
+       struct hlist_node *n;
+       long hash = nfs4_deviceid_hash(id);
+
+       dprintk("--> %s hash %ld\n", __func__, hash);
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+               if (memcmp(&d->de_id, id, sizeof(*id)) != 0)
+                       continue;
+               /* First id match decides: either we get a ref or we give up */
+               if (atomic_inc_not_zero(&d->de_ref))
+                       found = d;
+               break;
+       }
+       rcu_read_unlock();
+       return found;
+}
+EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
+
+/*
+ * Add a deviceid to the cache.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+struct pnfs_deviceid_node *
+pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
+{
+       struct pnfs_deviceid_node *d;
+       long hash = nfs4_deviceid_hash(&new->de_id);
+
+       dprintk("--> %s hash %ld\n", __func__, hash);
+       /* Lookup and insertion happen under one dc_lock hold */
+       spin_lock(&c->dc_lock);
+       d = pnfs_find_get_deviceid(c, &new->de_id);
+       if (d) {
+               /* Already cached: free @new, return the referenced entry */
+               spin_unlock(&c->dc_lock);
+               dprintk("%s [discard]\n", __func__);
+               c->dc_free_callback(new);
+               return d;
+       }
+       INIT_HLIST_NODE(&new->de_node);
+       /* The initial reference is the one returned to the caller */
+       atomic_set(&new->de_ref, 1);
+       hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
+       spin_unlock(&c->dc_lock);
+       dprintk("%s [new]\n", __func__);
+       return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
+
+/*
+ * Drop a reference on @clp's device ID cache. The last put detaches the
+ * cache from @clp and frees it; every hash bucket must be empty by then.
+ * clp->cl_devid_cache is dereferenced without a NULL check.
+ */
+void
+pnfs_put_deviceid_cache(struct nfs_client *clp)
+{
+       struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
+
+       dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+       /* cl_lock is taken only when this put drops the last reference */
+       if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
+               int i;
+               /* Verify cache is empty */
+               for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
+                       BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
+               clp->cl_devid_cache = NULL;
+               spin_unlock(&clp->cl_lock);
+               kfree(local);
+       }
+}
+EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
new file mode 100644 (file)
index 0000000..e12367d
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ *  pNFS client data structures.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+
+#ifndef FS_NFS_PNFS_H
+#define FS_NFS_PNFS_H
+
+/* Generic part of one layout segment cached for an inode */
+struct pnfs_layout_segment {
+       struct list_head fi_list;       /* entry in pnfs_layout_hdr.segs */
+       struct pnfs_layout_range range; /* copied from the LAYOUTGET result */
+       struct kref kref;               /* dropped via put_lseg() */
+       struct pnfs_layout_hdr *layout; /* owning header, set by init_lseg() */
+};
+
+#ifdef CONFIG_NFS_V4_1
+
+#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
+
+/* Bit numbers used with set_bit/test_bit on pnfs_layout_hdr.state */
+enum {
+       NFS_LAYOUT_RO_FAILED = 0,       /* get ro layout failed stop trying */
+       NFS_LAYOUT_RW_FAILED,           /* get rw layout failed stop trying */
+       NFS_LAYOUT_STATEID_SET,         /* have a valid layout stateid */
+};
+
+/* Per-layout driver specific registration structure */
+struct pnfs_layoutdriver_type {
+       struct list_head pnfs_tblid;    /* entry in pnfs_modules_tbl */
+       const u32 id;                   /* layout type; 0 is reserved */
+       const char *name;
+       struct module *owner;           /* pinned while a server uses the driver */
+       /* Called from set_pnfs_layoutdriver(); non-zero return means failure */
+       int (*set_layoutdriver) (struct nfs_server *);
+       /* Called from unset_pnfs_layoutdriver() */
+       int (*clear_layoutdriver) (struct nfs_server *);
+       /* Both lseg hooks are mandatory; registration fails without them */
+       struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
+       void (*free_lseg) (struct pnfs_layout_segment *lseg);
+};
+
+/* Per-inode layout cache header, reached through NFS_I(inode)->layout */
+struct pnfs_layout_hdr {
+       unsigned long           refcount;  /* changed under inode->i_lock */
+       struct list_head        layouts;   /* other client layouts */
+       struct list_head        segs;      /* layout segments list */
+       seqlock_t               seqlock;   /* Protects the stateid */
+       nfs4_stateid            stateid;
+       unsigned long           state;     /* NFS_LAYOUT_* bit flags */
+       struct inode            *inode;    /* inode this layout belongs to */
+};
+
+/*
+ * Argument/result block for nfs4_proc_getdeviceinfo().
+ * NOTE(review): field meanings are not visible in this file - presumably
+ * pages/pgbase/pglen describe the reply buffer; confirm against nfs4xdr.c.
+ */
+struct pnfs_device {
+       struct nfs4_deviceid dev_id;
+       unsigned int  layout_type;
+       unsigned int  mincount;
+       struct page **pages;
+       void          *area;
+       unsigned int  pgbase;
+       unsigned int  pglen;
+};
+
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS       5
+#define NFS4_DEVICE_ID_HASH_SIZE       (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK       (NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+/*
+ * Hash the NFS4_DEVICEID4_SIZE bytes of @id into a bucket index in
+ * [0, NFS4_DEVICE_ID_HASH_SIZE): multiply-by-37 accumulate, then mask.
+ */
+static inline u32
+nfs4_deviceid_hash(struct nfs4_deviceid *id)
+{
+       unsigned char *bytes = (unsigned char *)id->data;
+       unsigned int i;
+       u32 hash = 0;
+
+       for (i = 0; i < NFS4_DEVICEID4_SIZE; i++)
+               hash = hash * 37 + bytes[i];
+
+       return hash & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+/* One cached device ID */
+struct pnfs_deviceid_node {
+       struct hlist_node       de_node;        /* entry in a dc_deviceids[] bucket */
+       struct nfs4_deviceid    de_id;          /* lookup key */
+       atomic_t                de_ref;         /* last put unhashes and frees */
+};
+
+/* Device ID cache, reached through nfs_client.cl_devid_cache */
+struct pnfs_deviceid_cache {
+       spinlock_t              dc_lock;        /* taken to add/remove hash entries */
+       atomic_t                dc_ref;         /* references to the cache itself */
+       /* Frees a node; called on last put and on a discarded duplicate add */
+       void                    (*dc_free_callback)(struct pnfs_deviceid_node *);
+       struct hlist_head       dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
+};
+
+extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
+                       void (*free_callback)(struct pnfs_deviceid_node *));
+extern void pnfs_put_deviceid_cache(struct nfs_client *);
+extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
+                               struct pnfs_deviceid_cache *,
+                               struct nfs4_deviceid *);
+extern struct pnfs_deviceid_node *pnfs_add_deviceid(
+                               struct pnfs_deviceid_cache *,
+                               struct pnfs_deviceid_node *);
+extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
+                             struct pnfs_deviceid_node *devid);
+
+extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
+extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+
+/* nfs4proc.c */
+extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+                                  struct pnfs_device *dev);
+extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+
+/* pnfs.c */
+struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+                  enum pnfs_iomode access_type);
+void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
+void unset_pnfs_layoutdriver(struct nfs_server *);
+int pnfs_layout_process(struct nfs4_layoutget *lgp);
+void pnfs_destroy_layout(struct nfs_inode *);
+void pnfs_destroy_all_layouts(struct nfs_client *);
+void put_layout_hdr(struct inode *inode);
+void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+                            struct nfs4_state *open_state);
+
+
+/* Map an iomode to the pnfs_layout_hdr.state bit recording LAYOUTGET failure */
+static inline int lo_fail_bit(u32 iomode)
+{
+       if (iomode == IOMODE_RW)
+               return NFS_LAYOUT_RW_FAILED;
+       /* Every other iomode shares the read-only failure bit */
+       return NFS_LAYOUT_RO_FAILED;
+}
+
+/*
+ * Return true (1) if a layout driver is attached to this mountpoint,
+ * i.e. nfss->pnfs_curr_ld is set, and 0 otherwise.
+ */
+static inline int pnfs_enabled_sb(struct nfs_server *nfss)
+{
+       return nfss->pnfs_curr_ld ? 1 : 0;
+}
+
+#else  /* CONFIG_NFS_V4_1 */
+
+/*
+ * Without CONFIG_NFS_V4_1 the pNFS entry points below compile to no-ops,
+ * so callers do not need their own #ifdefs.
+ */
+static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+}
+
+static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+}
+
+/* Never yields a layout segment in this configuration */
+static inline struct pnfs_layout_segment *
+pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
+                  enum pnfs_iomode access_type)
+{
+       return NULL;
+}
+
+static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
+{
+}
+
+static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
+{
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+#endif /* FS_NFS_PNFS_H */
index 79859c81a9433a6c615b9f8f32a26db71cea5705..e4b62c6f5a6e9eb721eda53d836c41055ab2e1b7 100644 (file)
@@ -25,6 +25,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY                NFSDBG_PAGECACHE
 
@@ -120,6 +121,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
        len = nfs_page_length(page);
        if (len == 0)
                return nfs_return_empty_page(page);
+       pnfs_update_layout(inode, ctx, IOMODE_READ);
        new = nfs_create_request(ctx, inode, page, 0, len);
        if (IS_ERR(new)) {
                unlock_page(page);
@@ -624,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
        if (ret == 0)
                goto read_complete; /* all pages were read */
 
+       pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
        if (rsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
        else
index 988cbb3a19b6785378a5dcaab1118e4b3d78b14b..014482c4e57d56e7f15f3cb11d31ee30dbc4330d 100644 (file)
@@ -41,7 +41,6 @@
 
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
-#define NFS4_STATEID_SIZE 16
 
 /* Index of predefined Linux callback client operations */
 
index 07e40c62597211c58d50ce4d61fca14a8021c34f..a9683d6acaa49f9fb16201b16302e69498cad521 100644 (file)
@@ -17,7 +17,9 @@
 
 #define NFS4_BITMAP_SIZE       2
 #define NFS4_VERIFIER_SIZE     8
-#define NFS4_STATEID_SIZE      16
+/* Stateid size in bytes, expressed as its seqid part plus its "other" part */
+#define NFS4_STATEID_SEQID_SIZE 4
+#define NFS4_STATEID_OTHER_SIZE 12
+#define NFS4_STATEID_SIZE      (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE)
 #define NFS4_FHSIZE            128
 #define NFS4_MAXPATHLEN                PATH_MAX
 #define NFS4_MAXNAMLEN         NAME_MAX
@@ -167,7 +169,16 @@ struct nfs4_acl {
 };
 
 typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid;
+
+/*
+ * Structured view of a stateid: a big-endian seqid followed by
+ * NFS4_STATEID_OTHER_SIZE further bytes.  Declared packed so that no
+ * padding is inserted between or after the members.
+ */
+struct nfs41_stateid {
+       __be32 seqid;
+       char other[NFS4_STATEID_OTHER_SIZE];
+} __attribute__ ((packed));
+
+/*
+ * A stateid, accessible either as NFS4_STATEID_SIZE raw bytes (data) or
+ * through the seqid/other members of struct nfs41_stateid (stateid).
+ */
+typedef union {
+       char data[NFS4_STATEID_SIZE];
+       struct nfs41_stateid stateid;
+} nfs4_stateid;
 
 enum nfs_opnum4 {
        OP_ACCESS = 3,
@@ -471,6 +482,8 @@ enum lock_type4 {
 #define FATTR4_WORD1_TIME_MODIFY        (1UL << 21)
 #define FATTR4_WORD1_TIME_MODIFY_SET    (1UL << 22)
 #define FATTR4_WORD1_MOUNTED_ON_FILEID  (1UL << 23)
+#define FATTR4_WORD1_FS_LAYOUT_TYPES    (1UL << 30)
+/*
+ * NOTE(review): this is a WORD2 bit, while NFS4_BITMAP_SIZE in this header
+ * is 2 -- confirm that users of this flag size their bitmaps with a third
+ * word.
+ */
+#define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 
 #define NFSPROC4_NULL 0
 #define NFSPROC4_COMPOUND 1
@@ -532,6 +545,8 @@ enum {
        NFSPROC4_CLNT_SEQUENCE,
        NFSPROC4_CLNT_GET_LEASE_TIME,
        NFSPROC4_CLNT_RECLAIM_COMPLETE,
+       NFSPROC4_CLNT_LAYOUTGET,
+       NFSPROC4_CLNT_GETDEVICEINFO,
 };
 
 /* nfs41 types */
@@ -550,6 +565,49 @@ enum state_protect_how4 {
        SP4_SSV         = 2
 };
 
+/* pNFS layout type identifiers (protocol-level constants) */
+enum pnfs_layouttype {
+       LAYOUT_NFSV4_1_FILES  = 1,
+       LAYOUT_OSD2_OBJECTS = 2,
+       LAYOUT_BLOCK_VOLUME = 3,
+};
+
+/* used for both layout return and recall */
+enum pnfs_layoutreturn_type {
+       RETURN_FILE = 1,
+       RETURN_FSID = 2,
+       RETURN_ALL  = 3
+};
+
+/*
+ * I/O mode of a layout.  Presumably the value stored in
+ * struct pnfs_layout_range.iomode -- confirm against the XDR code.
+ */
+enum pnfs_iomode {
+       IOMODE_READ = 1,
+       IOMODE_RW = 2,
+       IOMODE_ANY = 3,
+};
+
+/* Note: these are single-bit masks (bits 1 and 2), not sequential values */
+enum pnfs_notify_deviceid_type4 {
+       NOTIFY_DEVICEID4_CHANGE = 1 << 1,
+       NOTIFY_DEVICEID4_DELETE = 1 << 2,
+};
+
+/*
+ * NFL4_UFLG_MASK selects the low six bits of a 32-bit word and
+ * NFL4_UFLG_STRIPE_UNIT_SIZE_MASK the remaining upper bits; the two masks
+ * are exact complements.  DENSE and COMMIT_THRU_MDS are individual flag
+ * bits inside NFL4_UFLG_MASK.
+ */
+#define NFL4_UFLG_MASK                 0x0000003F
+#define NFL4_UFLG_DENSE                        0x00000001
+#define NFL4_UFLG_COMMIT_THRU_MDS      0x00000002
+#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK        0xFFFFFFC0
+
+/*
+ * Encoded in the loh_body field of type layouthint4.
+ * The first two values reuse the NFL4_UFLG_* flag bits; the two STRIPE
+ * values (0x40, 0x80) lie outside NFL4_UFLG_MASK.
+ */
+enum filelayout_hint_care4 {
+       NFLH4_CARE_DENSE                = NFL4_UFLG_DENSE,
+       NFLH4_CARE_COMMIT_THRU_MDS      = NFL4_UFLG_COMMIT_THRU_MDS,
+       NFLH4_CARE_STRIPE_UNIT_SIZE     = 0x00000040,
+       NFLH4_CARE_STRIPE_COUNT         = 0x00000080
+};
+
+/* Size in bytes of a device ID */
+#define NFS4_DEVICEID4_SIZE 16
+
+/* Device ID, held as NFS4_DEVICEID4_SIZE raw bytes */
+struct nfs4_deviceid {
+       char data[NFS4_DEVICEID4_SIZE];
+};
+
 #endif
 #endif
 
index a46e430d96226fe8e9f23324c07a313befd5b0ae..bba26684acdc5e0e9364419db9cc3fb7802c1109 100644 (file)
@@ -188,6 +188,9 @@ struct nfs_inode {
        struct nfs_delegation __rcu *delegation;
        fmode_t                  delegation_state;
        struct rw_semaphore     rwsem;
+
+       /* pNFS layout information */
+       struct pnfs_layout_hdr *layout;
 #endif /* CONFIG_NFS_V4*/
 #ifdef CONFIG_NFS_FSCACHE
        struct fscache_cookie   *fscache;
@@ -615,6 +618,8 @@ nfs_fileid_to_ino_t(u64 fileid)
 #define NFSDBG_CLIENT          0x0200
 #define NFSDBG_MOUNT           0x0400
 #define NFSDBG_FSCACHE         0x0800
+/* pNFS debug bits; both fall within NFSDBG_ALL (0xFFFF) */
+#define NFSDBG_PNFS            0x1000
+#define NFSDBG_PNFS_LD         0x2000
 #define NFSDBG_ALL             0xFFFF
 
 #ifdef __KERNEL__
index 5eef862ec1871f6c2c2ffc4125f22daed18540d7..452d96436d266d6ea6cef73f2e3e1dd74a1f76b2 100644 (file)
@@ -82,6 +82,8 @@ struct nfs_client {
        /* The flags used for obtaining the clientid during EXCHANGE_ID */
        u32                     cl_exchange_flags;
        struct nfs4_session     *cl_session;    /* sharred session */
+       struct list_head        cl_layouts;
+       struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
 #endif /* CONFIG_NFS_V4_1 */
 
 #ifdef CONFIG_NFS_FSCACHE
@@ -145,6 +147,7 @@ struct nfs_server {
        u32                     acl_bitmask;    /* V4 bitmask representing the ACEs
                                                   that are supported on this
                                                   filesystem */
+       struct pnfs_layoutdriver_type  *pnfs_curr_ld; /* Active layout driver */
 #endif
        void (*destroy)(struct nfs_server *);
 
index da7a1300dc60bb01423cafd64cb8028b7de5dcb2..ba6cc8f223c94190a13a2e1ef2316f993c205010 100644 (file)
@@ -114,6 +114,7 @@ struct nfs_fsinfo {
        __u64                   maxfilesize;
        struct timespec         time_delta; /* server time granularity */
        __u32                   lease_time; /* in seconds */
+       __u32                   layouttype; /* supported pnfs layout driver */
 };
 
 struct nfs_fsstat {
@@ -186,6 +187,55 @@ struct nfs4_get_lease_time_res {
        struct nfs4_sequence_res        lr_seq_res;
 };
 
+/* NOTE(review): presumably the byte limit for a layout body -- confirm at the users */
+#define PNFS_LAYOUT_MAXSIZE 4096
+
+/* Length-tagged buffer: buf points at len bytes of data */
+struct nfs4_layoutdriver_data {
+       __u32 len;
+       void *buf;
+};
+
+/* I/O mode plus the (offset, length) extent a layout applies to */
+struct pnfs_layout_range {
+       u32 iomode;     /* presumably an enum pnfs_iomode value -- confirm */
+       u64 offset;
+       u64 length;
+};
+
+/* Request side of a LAYOUTGET call */
+struct nfs4_layoutget_args {
+       __u32 type;
+       struct pnfs_layout_range range;
+       __u64 minlength;
+       __u32 maxcount;
+       struct inode *inode;
+       struct nfs_open_context *ctx;
+       struct nfs4_sequence_args seq_args;
+};
+
+/* Reply side of a LAYOUTGET call */
+struct nfs4_layoutget_res {
+       __u32 return_on_close;
+       struct pnfs_layout_range range;
+       __u32 type;
+       nfs4_stateid stateid;
+       struct nfs4_layoutdriver_data layout;
+       struct nfs4_sequence_res seq_res;
+};
+
+/*
+ * One LAYOUTGET operation: its arguments, its result, a pointer to where
+ * a layout segment pointer is kept, and a status value.  This is the type
+ * taken by nfs4_proc_layoutget() and pnfs_layout_process().
+ */
+struct nfs4_layoutget {
+       struct nfs4_layoutget_args args;
+       struct nfs4_layoutget_res res;
+       struct pnfs_layout_segment **lsegpp;
+       int status;
+};
+
+/* Request side of a GETDEVICEINFO call; pdev names the device involved */
+struct nfs4_getdeviceinfo_args {
+       struct pnfs_device *pdev;
+       struct nfs4_sequence_args seq_args;
+};
+
+/* Reply side of a GETDEVICEINFO call; carries a pnfs_device pointer too */
+struct nfs4_getdeviceinfo_res {
+       struct pnfs_device *pdev;
+       struct nfs4_sequence_res seq_res;
+};
+
 /*
  * Arguments to the open call.
  */
index ab91d86565fd6a09babe71008781ab6ecaf595ca..498ab93a81e4d6afb8233e0925671556571c20b2 100644 (file)
@@ -132,6 +132,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp)
        return p + 2;
 }
 
+/*
+ * Copy @len bytes of fixed-length opaque data from the XDR buffer at @p
+ * into @ptr.  Returns the position just past the data, i.e. @p advanced
+ * by XDR_QUADLEN(len) __be32 words.
+ */
+static inline __be32 *
+xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len)
+{
+       __be32 *next = p + XDR_QUADLEN(len);
+
+       memcpy(ptr, p, len);
+       return next;
+}
+
 /*
  * Adjust kvec to reflect end of xdr'ed data (RPC client XDR)
  */
index 778e5dfc5144910f83609b8bf48ca2a35011110d..f375decc024b4bbd176ee448fc72e26fc4cd020c 100644 (file)
@@ -427,7 +427,7 @@ static int
 context_derive_keys_rc4(struct krb5_ctx *ctx)
 {
        struct crypto_hash *hmac;
-       char sigkeyconstant[] = "signaturekey";
+       static const char sigkeyconstant[] = "signaturekey";
        int slen = strlen(sigkeyconstant) + 1;  /* include null terminator */
        struct hash_desc desc;
        struct scatterlist sg[1];