git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge tag 'ceph-for-4.11-rc1' of git://github.com/ceph/ceph-client
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 28 Feb 2017 23:36:09 +0000 (15:36 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 28 Feb 2017 23:36:09 +0000 (15:36 -0800)
Pull ceph updates from Ilya Dryomov:
 "This time around we have:

   - support for rbd data-pool feature, which enables rbd images on
     erasure-coded pools (myself). CEPH_PG_MAX_SIZE has been bumped to
     allow erasure-coded profiles with k+m up to 32.

   - a patch for ceph_d_revalidate() performance regression introduced
     in 4.9, along with some cleanups in the area (Jeff Layton)

   - a set of fixes for unsafe ->d_parent accesses in CephFS (Jeff
     Layton)

   - buffered reads are now processed in rsize windows instead of rasize
     windows (Andreas Gerstmayr). The new default for rsize mount option
     is 64M.

   - ack vs commit distinction is gone, greatly simplifying ->fsync()
     and MOSDOpReply handling code (myself)

  ... also a few filesystem bug fixes from Zheng, a CRUSH sync up (CRUSH
  computations are still serialized though) and several minor fixes and
  cleanups all over"

* tag 'ceph-for-4.11-rc1' of git://github.com/ceph/ceph-client: (52 commits)
  libceph, rbd, ceph: WRITE | ONDISK -> WRITE
  libceph: get rid of ack vs commit
  ceph: remove special ack vs commit behavior
  ceph: tidy some white space in get_nonsnap_parent()
  crush: fix dprintk compilation
  crush: do is_out test only if we do not collide
  ceph: remove req from unsafe list when unregistering it
  rbd: constify device_type structure
  rbd: kill obj_request->object_name and rbd_segment_name_cache
  rbd: store and use obj_request->object_no
  rbd: RBD_V{1,2}_DATA_FORMAT macros
  rbd: factor out __rbd_osd_req_create()
  rbd: set offset and length outside of rbd_obj_request_create()
  rbd: support for data-pool feature
  rbd: introduce rbd_init_layout()
  rbd: use rbd_obj_bytes() more
  rbd: remove now unused rbd_obj_request_wait() and helpers
  rbd: switch rbd_obj_method_sync() to ceph_osdc_call()
  libceph: pass reply buffer length through ceph_osdc_call()
  rbd: do away with obj_request in rbd_obj_read_sync()
  ...

28 files changed:
Documentation/filesystems/ceph.txt
drivers/block/rbd.c
drivers/block/rbd_types.h
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/ioctl.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/super.c
fs/ceph/super.h
include/linux/ceph/osd_client.h
include/linux/ceph/osdmap.h
include/linux/ceph/rados.h
include/linux/crush/crush.h
include/linux/crush/mapper.h
net/ceph/cls_lock_client.c
net/ceph/crush/crush.c
net/ceph/crush/mapper.c
net/ceph/crypto.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/ceph/snapshot.c

index f5306ee40ea98216602d2214e8944a6aa11edadf..0b302a11718a43fd7ed44725390a5a4ceacb2229 100644 (file)
@@ -98,11 +98,10 @@ Mount Options
        size.
 
   rsize=X
-       Specify the maximum read size in bytes.  By default there is no
-       maximum.
+       Specify the maximum read size in bytes.  Default: 64 MB.
 
   rasize=X
-       Specify the maximum readahead.
+       Specify the maximum readahead.  Default: 8 MB.
 
   mount_timeout=X
        Specify the timeout value for mount (in seconds), in the case
index 362cecc77130260459d81d18d8853f39a7eb35eb..4d680772379828423d8605b1cae8c5da271ec5b8 100644 (file)
@@ -123,9 +123,11 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURE_LAYERING   (1<<0)
 #define RBD_FEATURE_STRIPINGV2 (1<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2)
+#define RBD_FEATURE_DATA_POOL (1<<7)
 #define RBD_FEATURES_ALL       (RBD_FEATURE_LAYERING |         \
                                 RBD_FEATURE_STRIPINGV2 |       \
-                                RBD_FEATURE_EXCLUSIVE_LOCK)
+                                RBD_FEATURE_EXCLUSIVE_LOCK |   \
+                                RBD_FEATURE_DATA_POOL)
 
 /* Features supported by this (client software) implementation. */
 
@@ -144,10 +146,9 @@ struct rbd_image_header {
        /* These six fields never change for a given rbd image */
        char *object_prefix;
        __u8 obj_order;
-       __u8 crypt_type;
-       __u8 comp_type;
        u64 stripe_unit;
        u64 stripe_count;
+       s64 data_pool_id;
        u64 features;           /* Might be changeable someday? */
 
        /* The remaining fields need to be updated occasionally */
@@ -230,7 +231,7 @@ enum obj_req_flags {
 };
 
 struct rbd_obj_request {
-       const char              *object_name;
+       u64                     object_no;
        u64                     offset;         /* object start byte */
        u64                     length;         /* bytes from offset */
        unsigned long           flags;
@@ -438,7 +439,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock);
 
 static struct kmem_cache       *rbd_img_request_cache;
 static struct kmem_cache       *rbd_obj_request_cache;
-static struct kmem_cache       *rbd_segment_name_cache;
 
 static int rbd_major;
 static DEFINE_IDA(rbd_dev_id_ida);
@@ -972,6 +972,30 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
        return true;
 }
 
+/*
+ * returns the size of an object in the image
+ */
+static u32 rbd_obj_bytes(struct rbd_image_header *header)
+{
+       return 1U << header->obj_order;
+}
+
+static void rbd_init_layout(struct rbd_device *rbd_dev)
+{
+       if (rbd_dev->header.stripe_unit == 0 ||
+           rbd_dev->header.stripe_count == 0) {
+               rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
+               rbd_dev->header.stripe_count = 1;
+       }
+
+       rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
+       rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
+       rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
+       rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
+                         rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
+       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
+}
+
 /*
  * Fill an rbd image header with information from the given format 1
  * on-disk header.
@@ -992,15 +1016,11 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
        /* Allocate this now to avoid having to handle failure below */
 
        if (first_time) {
-               size_t len;
-
-               len = strnlen(ondisk->object_prefix,
-                               sizeof (ondisk->object_prefix));
-               object_prefix = kmalloc(len + 1, GFP_KERNEL);
+               object_prefix = kstrndup(ondisk->object_prefix,
+                                        sizeof(ondisk->object_prefix),
+                                        GFP_KERNEL);
                if (!object_prefix)
                        return -ENOMEM;
-               memcpy(object_prefix, ondisk->object_prefix, len);
-               object_prefix[len] = '\0';
        }
 
        /* Allocate the snapshot context and fill it in */
@@ -1051,12 +1071,7 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev,
        if (first_time) {
                header->object_prefix = object_prefix;
                header->obj_order = ondisk->options.order;
-               header->crypt_type = ondisk->options.crypt_type;
-               header->comp_type = ondisk->options.comp_type;
-               /* The rest aren't used for format 1 images */
-               header->stripe_unit = 0;
-               header->stripe_count = 0;
-               header->features = 0;
+               rbd_init_layout(rbd_dev);
        } else {
                ceph_put_snap_context(header->snapc);
                kfree(header->snap_names);
@@ -1232,42 +1247,9 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
        rbd_dev->mapping.features = 0;
 }
 
-static void rbd_segment_name_free(const char *name)
-{
-       /* The explicit cast here is needed to drop the const qualifier */
-
-       kmem_cache_free(rbd_segment_name_cache, (void *)name);
-}
-
-static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
-{
-       char *name;
-       u64 segment;
-       int ret;
-       char *name_format;
-
-       name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
-       if (!name)
-               return NULL;
-       segment = offset >> rbd_dev->header.obj_order;
-       name_format = "%s.%012llx";
-       if (rbd_dev->image_format == 2)
-               name_format = "%s.%016llx";
-       ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
-                       rbd_dev->header.object_prefix, segment);
-       if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
-               pr_err("error formatting segment name for #%llu (%d)\n",
-                       segment, ret);
-               rbd_segment_name_free(name);
-               name = NULL;
-       }
-
-       return name;
-}
-
 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 {
-       u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
 
        return offset & (segment_size - 1);
 }
@@ -1275,7 +1257,7 @@ static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
                                u64 offset, u64 length)
 {
-       u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
 
        offset &= segment_size - 1;
 
@@ -1286,14 +1268,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev,
        return length;
 }
 
-/*
- * returns the size of an object in the image
- */
-static u64 rbd_obj_bytes(struct rbd_image_header *header)
-{
-       return 1 << header->obj_order;
-}
-
 /*
  * bio helpers
  */
@@ -1623,7 +1597,9 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
 {
        struct ceph_osd_request *osd_req = obj_request->osd_req;
 
-       dout("%s %p osd_req %p\n", __func__, obj_request, osd_req);
+       dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
+            obj_request, obj_request->object_no, obj_request->offset,
+            obj_request->length, osd_req);
        if (obj_request_img_data_test(obj_request)) {
                WARN_ON(obj_request->callback != rbd_img_obj_callback);
                rbd_img_request_get(obj_request->img_request);
@@ -1631,44 +1607,6 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
        ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
 }
 
-static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
-{
-       dout("%s %p\n", __func__, obj_request);
-       ceph_osdc_cancel_request(obj_request->osd_req);
-}
-
-/*
- * Wait for an object request to complete.  If interrupted, cancel the
- * underlying osd request.
- *
- * @timeout: in jiffies, 0 means "wait forever"
- */
-static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
-                                 unsigned long timeout)
-{
-       long ret;
-
-       dout("%s %p\n", __func__, obj_request);
-       ret = wait_for_completion_interruptible_timeout(
-                                       &obj_request->completion,
-                                       ceph_timeout_jiffies(timeout));
-       if (ret <= 0) {
-               if (ret == 0)
-                       ret = -ETIMEDOUT;
-               rbd_obj_request_end(obj_request);
-       } else {
-               ret = 0;
-       }
-
-       dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
-       return ret;
-}
-
-static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
-{
-       return __rbd_obj_request_wait(obj_request, 0);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1955,8 +1893,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
                rbd_osd_call_callback(obj_request);
                break;
        default:
-               rbd_warn(NULL, "%s: unsupported op %hu",
-                       obj_request->object_name, (unsigned short) opcode);
+               rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d",
+                        obj_request->object_no, opcode);
                break;
        }
 
@@ -1980,6 +1918,40 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
        osd_req->r_data_offset = obj_request->offset;
 }
 
+static struct ceph_osd_request *
+__rbd_osd_req_create(struct rbd_device *rbd_dev,
+                    struct ceph_snap_context *snapc,
+                    int num_ops, unsigned int flags,
+                    struct rbd_obj_request *obj_request)
+{
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct ceph_osd_request *req;
+       const char *name_format = rbd_dev->image_format == 1 ?
+                                     RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
+
+       req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
+       if (!req)
+               return NULL;
+
+       req->r_flags = flags;
+       req->r_callback = rbd_osd_req_callback;
+       req->r_priv = obj_request;
+
+       req->r_base_oloc.pool = rbd_dev->layout.pool_id;
+       if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
+                       rbd_dev->header.object_prefix, obj_request->object_no))
+               goto err_req;
+
+       if (ceph_osdc_alloc_messages(req, GFP_NOIO))
+               goto err_req;
+
+       return req;
+
+err_req:
+       ceph_osdc_put_request(req);
+       return NULL;
+}
+
 /*
  * Create an osd request.  A read request has one osd op (read).
  * A write request has either one (watch) or two (hint+write) osd ops.
@@ -1993,8 +1965,6 @@ static struct ceph_osd_request *rbd_osd_req_create(
                                        struct rbd_obj_request *obj_request)
 {
        struct ceph_snap_context *snapc = NULL;
-       struct ceph_osd_client *osdc;
-       struct ceph_osd_request *osd_req;
 
        if (obj_request_img_data_test(obj_request) &&
                (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
@@ -2009,35 +1979,9 @@ static struct ceph_osd_request *rbd_osd_req_create(
 
        rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
 
-       /* Allocate and initialize the request, for the num_ops ops */
-
-       osdc = &rbd_dev->rbd_client->client->osdc;
-       osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
-                                         GFP_NOIO);
-       if (!osd_req)
-               goto fail;
-
-       if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
-               osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
-       else
-               osd_req->r_flags = CEPH_OSD_FLAG_READ;
-
-       osd_req->r_callback = rbd_osd_req_callback;
-       osd_req->r_priv = obj_request;
-
-       osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
-       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
-                            obj_request->object_name))
-               goto fail;
-
-       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
-               goto fail;
-
-       return osd_req;
-
-fail:
-       ceph_osdc_put_request(osd_req);
-       return NULL;
+       return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
+           (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
+           CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
 }
 
 /*
@@ -2050,10 +1994,6 @@ static struct ceph_osd_request *
 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 {
        struct rbd_img_request *img_request;
-       struct ceph_snap_context *snapc;
-       struct rbd_device *rbd_dev;
-       struct ceph_osd_client *osdc;
-       struct ceph_osd_request *osd_req;
        int num_osd_ops = 3;
 
        rbd_assert(obj_request_img_data_test(obj_request));
@@ -2065,77 +2005,34 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
        if (img_request_discard_test(img_request))
                num_osd_ops = 2;
 
-       /* Allocate and initialize the request, for all the ops */
-
-       snapc = img_request->snapc;
-       rbd_dev = img_request->rbd_dev;
-       osdc = &rbd_dev->rbd_client->client->osdc;
-       osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
-                                               false, GFP_NOIO);
-       if (!osd_req)
-               goto fail;
-
-       osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
-       osd_req->r_callback = rbd_osd_req_callback;
-       osd_req->r_priv = obj_request;
-
-       osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
-       if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
-                            obj_request->object_name))
-               goto fail;
-
-       if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
-               goto fail;
-
-       return osd_req;
-
-fail:
-       ceph_osdc_put_request(osd_req);
-       return NULL;
+       return __rbd_osd_req_create(img_request->rbd_dev,
+                                   img_request->snapc, num_osd_ops,
+                                   CEPH_OSD_FLAG_WRITE, obj_request);
 }
 
-
 static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
 {
        ceph_osdc_put_request(osd_req);
 }
 
-/* object_name is assumed to be a non-null pointer and NUL-terminated */
-
-static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
-                                               u64 offset, u64 length,
-                                               enum obj_request_type type)
+static struct rbd_obj_request *
+rbd_obj_request_create(enum obj_request_type type)
 {
        struct rbd_obj_request *obj_request;
-       size_t size;
-       char *name;
 
        rbd_assert(obj_request_type_valid(type));
 
-       size = strlen(object_name) + 1;
-       name = kmalloc(size, GFP_NOIO);
-       if (!name)
-               return NULL;
-
        obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
-       if (!obj_request) {
-               kfree(name);
+       if (!obj_request)
                return NULL;
-       }
 
-       obj_request->object_name = memcpy(name, object_name, size);
-       obj_request->offset = offset;
-       obj_request->length = length;
-       obj_request->flags = 0;
        obj_request->which = BAD_WHICH;
        obj_request->type = type;
        INIT_LIST_HEAD(&obj_request->links);
        init_completion(&obj_request->completion);
        kref_init(&obj_request->kref);
 
-       dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
-               offset, length, (int)type, obj_request);
-
+       dout("%s %p\n", __func__, obj_request);
        return obj_request;
 }
 
@@ -2170,8 +2067,6 @@ static void rbd_obj_request_destroy(struct kref *kref)
                break;
        }
 
-       kfree(obj_request->object_name);
-       obj_request->object_name = NULL;
        kmem_cache_free(rbd_obj_request_cache, obj_request);
 }
 
@@ -2546,22 +2441,18 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
 
        while (resid) {
                struct ceph_osd_request *osd_req;
-               const char *object_name;
-               u64 offset;
-               u64 length;
+               u64 object_no = img_offset >> rbd_dev->header.obj_order;
+               u64 offset = rbd_segment_offset(rbd_dev, img_offset);
+               u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
 
-               object_name = rbd_segment_name(rbd_dev, img_offset);
-               if (!object_name)
-                       goto out_unwind;
-               offset = rbd_segment_offset(rbd_dev, img_offset);
-               length = rbd_segment_length(rbd_dev, img_offset, resid);
-               obj_request = rbd_obj_request_create(object_name,
-                                               offset, length, type);
-               /* object request has its own copy of the object name */
-               rbd_segment_name_free(object_name);
+               obj_request = rbd_obj_request_create(type);
                if (!obj_request)
                        goto out_unwind;
 
+               obj_request->object_no = object_no;
+               obj_request->offset = offset;
+               obj_request->length = length;
+
                /*
                 * set obj_request->img_request before creating the
                 * osd_request so that it gets the right snapc
@@ -2771,7 +2662,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
         * child image to which the original request was to be sent.
         */
        img_offset = obj_request->img_offset - obj_request->offset;
-       length = (u64)1 << rbd_dev->header.obj_order;
+       length = rbd_obj_bytes(&rbd_dev->header);
 
        /*
         * There is no defined parent data beyond the parent
@@ -2900,11 +2791,12 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
        size_t size;
        int ret;
 
-       stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
-                                             OBJ_REQUEST_PAGES);
+       stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
        if (!stat_request)
                return -ENOMEM;
 
+       stat_request->object_no = obj_request->object_no;
+
        stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
                                                   stat_request);
        if (!stat_request->osd_req) {
@@ -3983,17 +3875,17 @@ out:
  * returned in the outbound buffer, or a negative error code.
  */
 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
-                            const char *object_name,
-                            const char *class_name,
+                            struct ceph_object_id *oid,
+                            struct ceph_object_locator *oloc,
                             const char *method_name,
                             const void *outbound,
                             size_t outbound_size,
                             void *inbound,
                             size_t inbound_size)
 {
-       struct rbd_obj_request *obj_request;
-       struct page **pages;
-       u32 page_count;
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct page *req_page = NULL;
+       struct page *reply_page;
        int ret;
 
        /*
@@ -4003,61 +3895,35 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
         * method.  Currently if this is present it will be a
         * snapshot id.
         */
-       page_count = (u32)calc_pages_for(0, inbound_size);
-       pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
-       if (IS_ERR(pages))
-               return PTR_ERR(pages);
-
-       ret = -ENOMEM;
-       obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
-                                                       OBJ_REQUEST_PAGES);
-       if (!obj_request)
-               goto out;
+       if (outbound) {
+               if (outbound_size > PAGE_SIZE)
+                       return -E2BIG;
 
-       obj_request->pages = pages;
-       obj_request->page_count = page_count;
-
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req)
-               goto out;
-
-       osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
-                                       class_name, method_name);
-       if (outbound_size) {
-               struct ceph_pagelist *pagelist;
-
-               pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
-               if (!pagelist)
-                       goto out;
+               req_page = alloc_page(GFP_KERNEL);
+               if (!req_page)
+                       return -ENOMEM;
 
-               ceph_pagelist_init(pagelist);
-               ceph_pagelist_append(pagelist, outbound, outbound_size);
-               osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
-                                               pagelist);
+               memcpy(page_address(req_page), outbound, outbound_size);
        }
-       osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
-                                       obj_request->pages, inbound_size,
-                                       0, false, false);
-
-       rbd_obj_request_submit(obj_request);
-       ret = rbd_obj_request_wait(obj_request);
-       if (ret)
-               goto out;
 
-       ret = obj_request->result;
-       if (ret < 0)
-               goto out;
+       reply_page = alloc_page(GFP_KERNEL);
+       if (!reply_page) {
+               if (req_page)
+                       __free_page(req_page);
+               return -ENOMEM;
+       }
 
-       rbd_assert(obj_request->xferred < (u64)INT_MAX);
-       ret = (int)obj_request->xferred;
-       ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
-out:
-       if (obj_request)
-               rbd_obj_request_put(obj_request);
-       else
-               ceph_release_page_vector(pages, page_count);
+       ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
+                            CEPH_OSD_FLAG_READ, req_page, outbound_size,
+                            reply_page, &inbound_size);
+       if (!ret) {
+               memcpy(inbound, page_address(reply_page), inbound_size);
+               ret = inbound_size;
+       }
 
+       if (req_page)
+               __free_page(req_page);
+       __free_page(reply_page);
        return ret;
 }
 
@@ -4256,63 +4122,46 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
 }
 
 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
-                               const char *object_name,
-                               u64 offset, u64 length, void *buf)
+                            struct ceph_object_id *oid,
+                            struct ceph_object_locator *oloc,
+                            void *buf, int buf_len)
 
 {
-       struct rbd_obj_request *obj_request;
-       struct page **pages = NULL;
-       u32 page_count;
-       size_t size;
+       struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct ceph_osd_request *req;
+       struct page **pages;
+       int num_pages = calc_pages_for(0, buf_len);
        int ret;
 
-       page_count = (u32) calc_pages_for(offset, length);
-       pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
-       if (IS_ERR(pages))
-               return PTR_ERR(pages);
-
-       ret = -ENOMEM;
-       obj_request = rbd_obj_request_create(object_name, offset, length,
-                                                       OBJ_REQUEST_PAGES);
-       if (!obj_request)
-               goto out;
-
-       obj_request->pages = pages;
-       obj_request->page_count = page_count;
-
-       obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-                                                 obj_request);
-       if (!obj_request->osd_req)
-               goto out;
+       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+       if (!req)
+               return -ENOMEM;
 
-       osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
-                                       offset, length, 0, 0);
-       osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
-                                       obj_request->pages,
-                                       obj_request->length,
-                                       obj_request->offset & ~PAGE_MASK,
-                                       false, false);
+       ceph_oid_copy(&req->r_base_oid, oid);
+       ceph_oloc_copy(&req->r_base_oloc, oloc);
+       req->r_flags = CEPH_OSD_FLAG_READ;
 
-       rbd_obj_request_submit(obj_request);
-       ret = rbd_obj_request_wait(obj_request);
+       ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
        if (ret)
-               goto out;
+               goto out_req;
 
-       ret = obj_request->result;
-       if (ret < 0)
-               goto out;
+       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+       if (IS_ERR(pages)) {
+               ret = PTR_ERR(pages);
+               goto out_req;
+       }
 
-       rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
-       size = (size_t) obj_request->xferred;
-       ceph_copy_from_page_vector(pages, buf, 0, size);
-       rbd_assert(size <= (size_t)INT_MAX);
-       ret = (int)size;
-out:
-       if (obj_request)
-               rbd_obj_request_put(obj_request);
-       else
-               ceph_release_page_vector(pages, page_count);
+       osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
+       osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
+                                        true);
+
+       ceph_osdc_start_request(osdc, req, false);
+       ret = ceph_osdc_wait_request(osdc, req);
+       if (ret >= 0)
+               ceph_copy_from_page_vector(pages, buf, 0, ret);
 
+out_req:
+       ceph_osdc_put_request(req);
        return ret;
 }
 
@@ -4348,8 +4197,8 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
                if (!ondisk)
                        return -ENOMEM;
 
-               ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
-                                      0, size, ondisk);
+               ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
+                                       &rbd_dev->header_oloc, ondisk, size);
                if (ret < 0)
                        goto out;
                if ((size_t)ret < size) {
@@ -4781,7 +4630,7 @@ static const struct attribute_group *rbd_attr_groups[] = {
 
 static void rbd_dev_release(struct device *dev);
 
-static struct device_type rbd_device_type = {
+static const struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_dev_release,
@@ -4876,8 +4725,9 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
        INIT_LIST_HEAD(&rbd_dev->node);
        init_rwsem(&rbd_dev->header_rwsem);
 
+       rbd_dev->header.data_pool_id = CEPH_NOPOOL;
        ceph_oid_init(&rbd_dev->header_oid);
-       ceph_oloc_init(&rbd_dev->header_oloc);
+       rbd_dev->header_oloc.pool = spec->pool_id;
 
        mutex_init(&rbd_dev->watch_mutex);
        rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
@@ -4899,12 +4749,6 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
        rbd_dev->rbd_client = rbdc;
        rbd_dev->spec = spec;
 
-       rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
-       rbd_dev->layout.stripe_count = 1;
-       rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
-       rbd_dev->layout.pool_id = spec->pool_id;
-       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
-
        return rbd_dev;
 }
 
@@ -4970,10 +4814,10 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                __le64 size;
        } __attribute__ ((packed)) size_buf = { 0 };
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_size",
-                               &snapid, sizeof (snapid),
-                               &size_buf, sizeof (size_buf));
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_size",
+                                 &snapid, sizeof(snapid),
+                                 &size_buf, sizeof(size_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
@@ -5010,9 +4854,9 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_object_prefix", NULL, 0,
-                               reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_object_prefix",
+                                 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
@@ -5045,10 +4889,10 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
        u64 unsup;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_features",
-                               &snapid, sizeof (snapid),
-                               &features_buf, sizeof (features_buf));
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_features",
+                                 &snapid, sizeof(snapid),
+                                 &features_buf, sizeof(features_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
@@ -5107,10 +4951,9 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
        }
 
        snapid = cpu_to_le64(rbd_dev->spec->snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_parent",
-                               &snapid, sizeof (snapid),
-                               reply_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_parent",
+                                 &snapid, sizeof(snapid), reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out_err;
@@ -5210,9 +5053,9 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
        u64 stripe_count;
        int ret;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_stripe_unit_count", NULL, 0,
-                               (char *)&striping_info_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                               &rbd_dev->header_oloc, "get_stripe_unit_count",
+                               NULL, 0, &striping_info_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
@@ -5226,7 +5069,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
         * out, and only fail if the image has non-default values.
         */
        ret = -EINVAL;
-       obj_size = (u64)1 << rbd_dev->header.obj_order;
+       obj_size = rbd_obj_bytes(&rbd_dev->header);
        p = &striping_info_buf;
        stripe_unit = ceph_decode_64(&p);
        if (stripe_unit != obj_size) {
@@ -5247,8 +5090,27 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
        return 0;
 }
 
+static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
+{
+       __le64 data_pool_id;
+       int ret;
+
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_data_pool",
+                                 NULL, 0, &data_pool_id, sizeof(data_pool_id));
+       if (ret < 0)
+               return ret;
+       if (ret < sizeof(data_pool_id))
+               return -EBADMSG;
+
+       rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
+       WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
+       return 0;
+}
+
 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
 {
+       CEPH_DEFINE_OID_ONSTACK(oid);
        size_t image_id_size;
        char *image_id;
        void *p;
@@ -5276,10 +5138,10 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
        if (!reply_buf)
                goto out;
 
-       ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
-                               "rbd", "dir_get_name",
-                               image_id, image_id_size,
-                               reply_buf, size);
+       ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
+       ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+                                 "dir_get_name", image_id, image_id_size,
+                                 reply_buf, size);
        if (ret < 0)
                goto out;
        p = reply_buf;
@@ -5458,9 +5320,9 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
        if (!reply_buf)
                return -ENOMEM;
 
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_snapcontext", NULL, 0,
-                               reply_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_snapcontext",
+                                 NULL, 0, reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
@@ -5523,10 +5385,9 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                return ERR_PTR(-ENOMEM);
 
        snapid = cpu_to_le64(snap_id);
-       ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
-                               "rbd", "get_snapshot_name",
-                               &snapid, sizeof (snapid),
-                               reply_buf, size);
+       ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+                                 &rbd_dev->header_oloc, "get_snapshot_name",
+                                 &snapid, sizeof(snapid), reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0) {
                snap_name = ERR_PTR(ret);
@@ -5833,7 +5694,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 {
        int ret;
        size_t size;
-       char *object_name;
+       CEPH_DEFINE_OID_ONSTACK(oid);
        void *response;
        char *image_id;
 
@@ -5853,12 +5714,12 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
         * First, see if the format 2 image id file exists, and if
         * so, get the image's persistent id from it.
         */
-       size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
-       object_name = kmalloc(size, GFP_NOIO);
-       if (!object_name)
-               return -ENOMEM;
-       sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
-       dout("rbd id object name is %s\n", object_name);
+       ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
+                              rbd_dev->spec->image_name);
+       if (ret)
+               return ret;
+
+       dout("rbd id object name is %s\n", oid.name);
 
        /* Response will be an encoded string, which includes a length */
 
@@ -5871,9 +5732,9 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
 
        /* If it doesn't exist we'll assume it's a format 1 image */
 
-       ret = rbd_obj_method_sync(rbd_dev, object_name,
-                               "rbd", "get_id", NULL, 0,
-                               response, RBD_IMAGE_ID_LEN_MAX);
+       ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+                                 "get_id", NULL, 0,
+                                 response, RBD_IMAGE_ID_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret == -ENOENT) {
                image_id = kstrdup("", GFP_KERNEL);
@@ -5896,8 +5757,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
        }
 out:
        kfree(response);
-       kfree(object_name);
-
+       ceph_oid_destroy(&oid);
        return ret;
 }
 
@@ -5944,14 +5804,20 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
                if (ret < 0)
                        goto out_err;
        }
-       /* No support for crypto and compression type format 2 images */
 
+       if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
+               ret = rbd_dev_v2_data_pool(rbd_dev);
+               if (ret)
+                       goto out_err;
+       }
+
+       rbd_init_layout(rbd_dev);
        return 0;
+
 out_err:
        rbd_dev->header.features = 0;
        kfree(rbd_dev->header.object_prefix);
        rbd_dev->header.object_prefix = NULL;
-
        return ret;
 }
 
@@ -6077,8 +5943,6 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
        /* Record the header object name for this rbd image. */
 
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
-
-       rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
        if (rbd_dev->image_format == 1)
                ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
                                       spec->image_name, RBD_SUFFIX);
@@ -6471,27 +6335,16 @@ static int rbd_slab_init(void)
        if (!rbd_obj_request_cache)
                goto out_err;
 
-       rbd_assert(!rbd_segment_name_cache);
-       rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
-                                       CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
-       if (rbd_segment_name_cache)
-               return 0;
-out_err:
-       kmem_cache_destroy(rbd_obj_request_cache);
-       rbd_obj_request_cache = NULL;
+       return 0;
 
+out_err:
        kmem_cache_destroy(rbd_img_request_cache);
        rbd_img_request_cache = NULL;
-
        return -ENOMEM;
 }
 
 static void rbd_slab_exit(void)
 {
-       rbd_assert(rbd_segment_name_cache);
-       kmem_cache_destroy(rbd_segment_name_cache);
-       rbd_segment_name_cache = NULL;
-
        rbd_assert(rbd_obj_request_cache);
        kmem_cache_destroy(rbd_obj_request_cache);
        rbd_obj_request_cache = NULL;
index 94f367db27b0b816e9585da18f0063a1523b23ca..62ff50d3e7a6f1f4da2d10249512f9c2cff1859f 100644 (file)
@@ -25,8 +25,8 @@
  */
 
 #define RBD_HEADER_PREFIX      "rbd_header."
-#define RBD_DATA_PREFIX        "rbd_data."
 #define RBD_ID_PREFIX          "rbd_id."
+#define RBD_V2_DATA_FORMAT     "%s.%016llx"
 
 #define RBD_LOCK_NAME          "rbd_lock"
 #define RBD_LOCK_TAG           "internal"
@@ -42,13 +42,14 @@ enum rbd_notify_op {
 /*
  * For format version 1, rbd image 'foo' consists of objects
  *   foo.rbd           - image metadata
- *   rb.<idhi>.<idlo>.00000000
- *   rb.<idhi>.<idlo>.00000001
+ *   rb.<idhi>.<idlo>.<extra>.000000000000
+ *   rb.<idhi>.<idlo>.<extra>.000000000001
  *   ...               - data
  * There is no notion of a persistent image id in rbd format 1.
  */
 
 #define RBD_SUFFIX             ".rbd"
+#define RBD_V1_DATA_FORMAT     "%s.%012llx"
 
 #define RBD_DIRECTORY           "rbd_directory"
 #define RBD_INFO                "rbd_info"
@@ -57,9 +58,6 @@ enum rbd_notify_op {
 #define RBD_MIN_OBJ_ORDER       16
 #define RBD_MAX_OBJ_ORDER       30
 
-#define RBD_COMP_NONE          0
-#define RBD_CRYPT_NONE         0
-
 #define RBD_HEADER_TEXT                "<<< Rados Block Device Image >>>\n"
 #define RBD_HEADER_SIGNATURE   "RBD"
 #define RBD_HEADER_VERSION     "001.005"
index 7ce35aec8c765566bd4eb87fd473fdceddf3ec92..f297a9e1864293d4eedfac5dea5957e65af1b25d 100644 (file)
@@ -391,6 +391,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
                        nr_pages = i;
                        if (nr_pages > 0) {
                                len = nr_pages << PAGE_SHIFT;
+                               osd_req_op_extent_update(req, 0, len);
                                break;
                        }
                        goto out_pages;
@@ -771,7 +772,7 @@ static int ceph_writepages_start(struct address_space *mapping,
             wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                if (ci->i_wrbuffer_ref > 0) {
                        pr_warn_ratelimited(
                                "writepage_start %p %lld forced umount\n",
@@ -1017,8 +1018,7 @@ new_request:
                                        &ci->i_layout, vino,
                                        offset, &len, 0, num_ops,
                                        CEPH_OSD_OP_WRITE,
-                                       CEPH_OSD_FLAG_WRITE |
-                                       CEPH_OSD_FLAG_ONDISK,
+                                       CEPH_OSD_FLAG_WRITE,
                                        snapc, truncate_seq,
                                        truncate_size, false);
                if (IS_ERR(req)) {
@@ -1028,8 +1028,7 @@ new_request:
                                                min(num_ops,
                                                    CEPH_OSD_SLAB_OPS),
                                                CEPH_OSD_OP_WRITE,
-                                               CEPH_OSD_FLAG_WRITE |
-                                               CEPH_OSD_FLAG_ONDISK,
+                                               CEPH_OSD_FLAG_WRITE,
                                                snapc, truncate_seq,
                                                truncate_size, true);
                        BUG_ON(IS_ERR(req));
@@ -1194,7 +1193,7 @@ static int ceph_update_writeable_page(struct file *file,
        int r;
        struct ceph_snap_context *snapc, *oldest;
 
-       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                dout(" page %p forced umount\n", page);
                unlock_page(page);
                return -EIO;
@@ -1681,8 +1680,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), 0, &len, 0, 1,
-                                   CEPH_OSD_OP_CREATE,
-                                   CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+                                   CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
                                    NULL, 0, 0, false);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
@@ -1699,8 +1697,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), 0, &len, 1, 3,
-                                   CEPH_OSD_OP_WRITE,
-                                   CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+                                   CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
                                    NULL, ci->i_truncate_seq,
                                    ci->i_truncate_size, false);
        if (IS_ERR(req)) {
@@ -1873,7 +1870,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
                goto out_unlock;
        }
 
-       wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
+       wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
        osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
        ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
        ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
index 5bc5d37b121712a2f288ede38b46420d13a2f0e5..4e7421caf3804c49ef052c02a736965e29437876 100644 (file)
@@ -234,7 +234,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
                fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable,
                                inode);
                if (fscache_cookie_enabled(ci->fscache)) {
-                       dout("fscache_file_set_cookie %p %p enabing cache\n",
+                       dout("fscache_file_set_cookie %p %p enabling cache\n",
                             inode, filp);
                }
        }
index 94fd76d04683d88103b42ff71a02490201a9783f..cd966f276a8d70ee9a3daa50c46eee5b1284f37a 100644 (file)
@@ -867,7 +867,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 /*
  * Return caps we have registered with the MDS(s) as 'wanted'.
  */
-int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
 {
        struct ceph_cap *cap;
        struct rb_node *p;
@@ -875,7 +875,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
 
        for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
                cap = rb_entry(p, struct ceph_cap, ci_node);
-               if (!__cap_is_valid(cap))
+               if (check && !__cap_is_valid(cap))
                        continue;
                if (cap == ci->i_auth_cap)
                        mds_wanted |= cap->mds_wanted;
@@ -1184,6 +1184,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
                delayed = 1;
        }
        ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+       if (want & ~cap->mds_wanted) {
+               /* user space may open/close single file frequently.
+                * This avoids dropping mds_wanted immediately after
+                * requesting new mds_wanted.
+                */
+               __cap_set_timeouts(mdsc, ci);
+       }
 
        cap->issued &= retain;  /* drop bits we don't want */
        if (cap->implemented & ~cap->issued) {
@@ -2084,8 +2091,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
        dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
 
-       ceph_sync_write_wait(inode);
-
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (ret < 0)
                goto out;
@@ -2477,23 +2482,22 @@ again:
 
                if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
                        int mds_wanted;
-                       if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+                       if (READ_ONCE(mdsc->fsc->mount_state) ==
                            CEPH_MOUNT_SHUTDOWN) {
                                dout("get_cap_refs %p forced umount\n", inode);
                                *err = -EIO;
                                ret = 1;
                                goto out_unlock;
                        }
-                       mds_wanted = __ceph_caps_mds_wanted(ci);
-                       if ((mds_wanted & need) != need) {
+                       mds_wanted = __ceph_caps_mds_wanted(ci, false);
+                       if (need & ~(mds_wanted & need)) {
                                dout("get_cap_refs %p caps were dropped"
                                     " (session killed?)\n", inode);
                                *err = -ESTALE;
                                ret = 1;
                                goto out_unlock;
                        }
-                       if ((mds_wanted & file_wanted) ==
-                           (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+                       if (!(file_wanted & ~mds_wanted))
                                ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
                }
 
@@ -3404,6 +3408,7 @@ retry:
                        tcap->implemented |= issued;
                        if (cap == ci->i_auth_cap)
                                ci->i_auth_cap = tcap;
+
                        if (!list_empty(&ci->i_cap_flush_list) &&
                            ci->i_auth_cap == tcap) {
                                spin_lock(&mdsc->cap_dirty_lock);
@@ -3417,9 +3422,18 @@ retry:
        } else if (tsession) {
                /* add placeholder for the export tagert */
                int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+               tcap = new_cap;
                ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
                             t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
 
+               if (!list_empty(&ci->i_cap_flush_list) &&
+                   ci->i_auth_cap == tcap) {
+                       spin_lock(&mdsc->cap_dirty_lock);
+                       list_move_tail(&ci->i_flushing_item,
+                                      &tcap->session->s_cap_flushing);
+                       spin_unlock(&mdsc->cap_dirty_lock);
+               }
+
                __ceph_remove_cap(cap, false);
                goto out_unlock;
        }
@@ -3924,9 +3938,10 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 }
 
 int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+                              struct inode *dir,
                               int mds, int drop, int unless)
 {
-       struct inode *dir = d_inode(dentry->d_parent);
+       struct dentry *parent = NULL;
        struct ceph_mds_request_release *rel = *p;
        struct ceph_dentry_info *di = ceph_dentry(dentry);
        int force = 0;
@@ -3941,9 +3956,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
        spin_lock(&dentry->d_lock);
        if (di->lease_session && di->lease_session->s_mds == mds)
                force = 1;
+       if (!dir) {
+               parent = dget(dentry->d_parent);
+               dir = d_inode(parent);
+       }
        spin_unlock(&dentry->d_lock);
 
        ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+       dput(parent);
 
        spin_lock(&dentry->d_lock);
        if (ret && di->lease_session && di->lease_session->s_mds == mds) {
index 39ff678e567fcb5c31d9729081119adaa4578def..f2ae393e2c31a2b3dbca7a5f81eeb5b81afa5e1b 100644 (file)
@@ -70,7 +70,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 
                seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
 
-               if (req->r_got_unsafe)
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                        seq_puts(s, "\t(unsafe)");
                else
                        seq_puts(s, "\t");
index 8ab1fdf0bd49b74f380a578aea92ce738393403d..3e9ad501addfe92f171a40dffb93c65209819cbe 100644 (file)
@@ -371,7 +371,7 @@ more:
                /* hints to request -> mds selection code */
                req->r_direct_mode = USE_AUTH_MDS;
                req->r_direct_hash = ceph_frag_value(frag);
-               req->r_direct_is_hash = true;
+               __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
                if (fi->last_name) {
                        req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
                        if (!req->r_path2) {
@@ -417,7 +417,7 @@ more:
                fi->frag = frag;
                fi->last_readdir = req;
 
-               if (req->r_did_prepopulate) {
+               if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
                        fi->readdir_cache_idx = req->r_readdir_cache_idx;
                        if (fi->readdir_cache_idx < 0) {
                                /* preclude from marking dir ordered */
@@ -752,7 +752,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                mask |= CEPH_CAP_XATTR_SHARED;
        req->r_args.getattr.mask = cpu_to_le32(mask);
 
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        err = ceph_mdsc_do_request(mdsc, NULL, req);
        err = ceph_handle_snapdir(req, dentry, err);
        dentry = ceph_finish_lookup(req, dentry, err);
@@ -813,7 +814,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_args.mknod.mode = cpu_to_le32(mode);
        req->r_args.mknod.rdev = cpu_to_le32(rdev);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
@@ -864,7 +866,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
                ceph_mdsc_put_request(req);
                goto out;
        }
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
@@ -913,7 +916,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_args.mkdir.mode = cpu_to_le32(mode);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -957,7 +961,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_old_dentry = dget(old_dentry);
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        /* release LINK_SHARED on source inode (mds will lock it) */
@@ -1023,7 +1028,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
-       req->r_locked_dir = dir;
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        req->r_inode_drop = drop_caps_for_unlink(inode);
@@ -1066,7 +1072,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
        req->r_num_caps = 2;
        req->r_old_dentry = dget(old_dentry);
        req->r_old_dentry_dir = old_dir;
-       req->r_locked_dir = new_dir;
+       req->r_parent = new_dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
@@ -1194,7 +1201,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
        struct inode *dir;
 
        if (flags & LOOKUP_RCU) {
-               parent = ACCESS_ONCE(dentry->d_parent);
+               parent = READ_ONCE(dentry->d_parent);
                dir = d_inode_rcu(parent);
                if (!dir)
                        return -ECHILD;
@@ -1237,11 +1244,12 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
                        return -ECHILD;
 
                op = ceph_snap(dir) == CEPH_SNAPDIR ?
-                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR;
+                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
                req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
                if (!IS_ERR(req)) {
                        req->r_dentry = dget(dentry);
-                       req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2;
+                       req->r_num_caps = 2;
+                       req->r_parent = dir;
 
                        mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
                        if (ceph_security_xattr_wanted(dir))
index 180bbef760f2c8c12fd94d458c246014634233dd..e8f11fa565c53ac58fddf402f6ade6320d47d490 100644 (file)
@@ -207,7 +207,8 @@ static int ceph_get_name(struct dentry *parent, char *name,
        req->r_inode = d_inode(child);
        ihold(d_inode(child));
        req->r_ino2 = ceph_vino(d_inode(parent));
-       req->r_locked_dir = d_inode(parent);
+       req->r_parent = d_inode(parent);
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_num_caps = 2;
        err = ceph_mdsc_do_request(mdsc, NULL, req);
 
index 045d30d2662485a4207945757659383fb314fa10..26cc95421cca6e62ef10bfd32b18cf1e526b7c51 100644 (file)
@@ -283,7 +283,7 @@ int ceph_open(struct inode *inode, struct file *file)
        spin_lock(&ci->i_ceph_lock);
        if (__ceph_is_any_real_caps(ci) &&
            (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
-               int mds_wanted = __ceph_caps_mds_wanted(ci);
+               int mds_wanted = __ceph_caps_mds_wanted(ci, true);
                int issued = __ceph_caps_issued(ci, NULL);
 
                dout("open %p fmode %d want %s issued %s using existing\n",
@@ -379,7 +379,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
                mask |= CEPH_CAP_XATTR_SHARED;
        req->r_args.open.mask = cpu_to_le32(mask);
 
-       req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
+       req->r_parent = dir;
+       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        err = ceph_mdsc_do_request(mdsc,
                                   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
                                   req);
@@ -758,9 +759,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
                goto out;
        }
 
-       req->r_flags =  CEPH_OSD_FLAG_ORDERSNAP |
-                       CEPH_OSD_FLAG_ONDISK |
-                       CEPH_OSD_FLAG_WRITE;
+       req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
        ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
        ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
 
@@ -794,89 +793,6 @@ out:
        kfree(aio_work);
 }
 
-/*
- * Write commit request unsafe callback, called to tell us when a
- * request is unsafe (that is, in flight--has been handed to the
- * messenger to send to its target osd).  It is called again when
- * we've received a response message indicating the request is
- * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
- * is completed early (and unsuccessfully) due to a timeout or
- * interrupt.
- *
- * This is used if we requested both an ACK and ONDISK commit reply
- * from the OSD.
- */
-static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
-{
-       struct ceph_inode_info *ci = ceph_inode(req->r_inode);
-
-       dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
-               unsafe ? "un" : "");
-       if (unsafe) {
-               ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
-               spin_lock(&ci->i_unsafe_lock);
-               list_add_tail(&req->r_unsafe_item,
-                             &ci->i_unsafe_writes);
-               spin_unlock(&ci->i_unsafe_lock);
-
-               complete_all(&req->r_completion);
-       } else {
-               spin_lock(&ci->i_unsafe_lock);
-               list_del_init(&req->r_unsafe_item);
-               spin_unlock(&ci->i_unsafe_lock);
-               ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
-       }
-}
-
-/*
- * Wait on any unsafe replies for the given inode.  First wait on the
- * newest request, and make that the upper bound.  Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-void ceph_sync_write_wait(struct inode *inode)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct list_head *head = &ci->i_unsafe_writes;
-       struct ceph_osd_request *req;
-       u64 last_tid;
-
-       if (!S_ISREG(inode->i_mode))
-               return;
-
-       spin_lock(&ci->i_unsafe_lock);
-       if (list_empty(head))
-               goto out;
-
-       /* set upper bound as _last_ entry in chain */
-
-       req = list_last_entry(head, struct ceph_osd_request,
-                             r_unsafe_item);
-       last_tid = req->r_tid;
-
-       do {
-               ceph_osdc_get_request(req);
-               spin_unlock(&ci->i_unsafe_lock);
-
-               dout("sync_write_wait on tid %llu (until %llu)\n",
-                    req->r_tid, last_tid);
-               wait_for_completion(&req->r_done_completion);
-               ceph_osdc_put_request(req);
-
-               spin_lock(&ci->i_unsafe_lock);
-               /*
-                * from here on look at first entry in chain, since we
-                * only want to wait for anything older than last_tid
-                */
-               if (list_empty(head))
-                       break;
-               req = list_first_entry(head, struct ceph_osd_request,
-                                      r_unsafe_item);
-       } while (req->r_tid < last_tid);
-out:
-       spin_unlock(&ci->i_unsafe_lock);
-}
-
 static ssize_t
 ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                       struct ceph_snap_context *snapc,
@@ -915,9 +831,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
                if (ret2 < 0)
                        dout("invalidate_inode_pages2_range returned %d\n", ret2);
 
-               flags = CEPH_OSD_FLAG_ORDERSNAP |
-                       CEPH_OSD_FLAG_ONDISK |
-                       CEPH_OSD_FLAG_WRITE;
+               flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
        } else {
                flags = CEPH_OSD_FLAG_READ;
        }
@@ -1116,10 +1030,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
        if (ret < 0)
                dout("invalidate_inode_pages2_range returned %d\n", ret);
 
-       flags = CEPH_OSD_FLAG_ORDERSNAP |
-               CEPH_OSD_FLAG_ONDISK |
-               CEPH_OSD_FLAG_WRITE |
-               CEPH_OSD_FLAG_ACK;
+       flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
 
        while ((len = iov_iter_count(from)) > 0) {
                size_t left;
@@ -1165,8 +1076,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
                        goto out;
                }
 
-               /* get a second commit callback */
-               req->r_unsafe_callback = ceph_sync_write_unsafe;
                req->r_inode = inode;
 
                osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
@@ -1616,8 +1525,7 @@ static int ceph_zero_partial_object(struct inode *inode,
                                        ceph_vino(inode),
                                        offset, length,
                                        0, 1, op,
-                                       CEPH_OSD_FLAG_WRITE |
-                                       CEPH_OSD_FLAG_ONDISK,
+                                       CEPH_OSD_FLAG_WRITE,
                                        NULL, 0, 0, false);
        if (IS_ERR(req)) {
                ret = PTR_ERR(req);
index 5e659d054b40ae6faac23af26c5321c5af6ff69b..fd8f771f99b7d7c0943170df1003a1c78e423af0 100644 (file)
@@ -499,7 +499,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_rdcache_gen = 0;
        ci->i_rdcache_revoking = 0;
 
-       INIT_LIST_HEAD(&ci->i_unsafe_writes);
        INIT_LIST_HEAD(&ci->i_unsafe_dirops);
        INIT_LIST_HEAD(&ci->i_unsafe_iops);
        spin_lock_init(&ci->i_unsafe_lock);
@@ -583,14 +582,6 @@ int ceph_drop_inode(struct inode *inode)
        return 1;
 }
 
-void ceph_evict_inode(struct inode *inode)
-{
-       /* wait unsafe sync writes */
-       ceph_sync_write_wait(inode);
-       truncate_inode_pages_final(&inode->i_data);
-       clear_inode(inode);
-}
-
 static inline blkcnt_t calc_inode_blocks(u64 size)
 {
        return (size + (1<<9) - 1) >> 9;
@@ -1016,7 +1007,9 @@ out:
 static void update_dentry_lease(struct dentry *dentry,
                                struct ceph_mds_reply_lease *lease,
                                struct ceph_mds_session *session,
-                               unsigned long from_time)
+                               unsigned long from_time,
+                               struct ceph_vino *tgt_vino,
+                               struct ceph_vino *dir_vino)
 {
        struct ceph_dentry_info *di = ceph_dentry(dentry);
        long unsigned duration = le32_to_cpu(lease->duration_ms);
@@ -1024,13 +1017,27 @@ static void update_dentry_lease(struct dentry *dentry,
        long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
        struct inode *dir;
 
+       /*
+        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
+        * we expect a negative dentry.
+        */
+       if (!tgt_vino && d_really_is_positive(dentry))
+               return;
+
+       if (tgt_vino && (d_really_is_negative(dentry) ||
+                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
+               return;
+
        spin_lock(&dentry->d_lock);
        dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
             dentry, duration, ttl);
 
-       /* make lease_rdcache_gen match directory */
        dir = d_inode(dentry->d_parent);
 
+       /* make sure parent matches dir_vino */
+       if (!ceph_ino_compare(dir, dir_vino))
+               goto out_unlock;
+
        /* only track leases on regular dentries */
        if (ceph_snap(dir) != CEPH_NOSNAP)
                goto out_unlock;
@@ -1108,61 +1115,27 @@ out:
  *
  * Called with snap_rwsem (read).
  */
-int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
-                   struct ceph_mds_session *session)
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
 {
+       struct ceph_mds_session *session = req->r_session;
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct inode *in = NULL;
-       struct ceph_vino vino;
+       struct ceph_vino tvino, dvino;
        struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
        int err = 0;
 
        dout("fill_trace %p is_dentry %d is_target %d\n", req,
             rinfo->head->is_dentry, rinfo->head->is_target);
 
-#if 0
-       /*
-        * Debugging hook:
-        *
-        * If we resend completed ops to a recovering mds, we get no
-        * trace.  Since that is very rare, pretend this is the case
-        * to ensure the 'no trace' handlers in the callers behave.
-        *
-        * Fill in inodes unconditionally to avoid breaking cap
-        * invariants.
-        */
-       if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
-               pr_info("fill_trace faking empty trace on %lld %s\n",
-                       req->r_tid, ceph_mds_op_name(rinfo->head->op));
-               if (rinfo->head->is_dentry) {
-                       rinfo->head->is_dentry = 0;
-                       err = fill_inode(req->r_locked_dir,
-                                        &rinfo->diri, rinfo->dirfrag,
-                                        session, req->r_request_started, -1);
-               }
-               if (rinfo->head->is_target) {
-                       rinfo->head->is_target = 0;
-                       ininfo = rinfo->targeti.in;
-                       vino.ino = le64_to_cpu(ininfo->ino);
-                       vino.snap = le64_to_cpu(ininfo->snapid);
-                       in = ceph_get_inode(sb, vino);
-                       err = fill_inode(in, &rinfo->targeti, NULL,
-                                        session, req->r_request_started,
-                                        req->r_fmode);
-                       iput(in);
-               }
-       }
-#endif
-
        if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
                dout("fill_trace reply is empty!\n");
-               if (rinfo->head->result == 0 && req->r_locked_dir)
+               if (rinfo->head->result == 0 && req->r_parent)
                        ceph_invalidate_dir_request(req);
                return 0;
        }
 
        if (rinfo->head->is_dentry) {
-               struct inode *dir = req->r_locked_dir;
+               struct inode *dir = req->r_parent;
 
                if (dir) {
                        err = fill_inode(dir, NULL,
@@ -1188,8 +1161,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                        dname.name = rinfo->dname;
                        dname.len = rinfo->dname_len;
                        dname.hash = full_name_hash(parent, dname.name, dname.len);
-                       vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-                       vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+                       tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+                       tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
 retry_lookup:
                        dn = d_lookup(parent, &dname);
                        dout("d_lookup on parent=%p name=%.*s got %p\n",
@@ -1206,8 +1179,8 @@ retry_lookup:
                                }
                                err = 0;
                        } else if (d_really_is_positive(dn) &&
-                                  (ceph_ino(d_inode(dn)) != vino.ino ||
-                                   ceph_snap(d_inode(dn)) != vino.snap)) {
+                                  (ceph_ino(d_inode(dn)) != tvino.ino ||
+                                   ceph_snap(d_inode(dn)) != tvino.snap)) {
                                dout(" dn %p points to wrong inode %p\n",
                                     dn, d_inode(dn));
                                d_delete(dn);
@@ -1221,10 +1194,10 @@ retry_lookup:
        }
 
        if (rinfo->head->is_target) {
-               vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-               vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+               tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+               tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
 
-               in = ceph_get_inode(sb, vino);
+               in = ceph_get_inode(sb, tvino);
                if (IS_ERR(in)) {
                        err = PTR_ERR(in);
                        goto done;
@@ -1233,8 +1206,8 @@ retry_lookup:
 
                err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
                                session, req->r_request_started,
-                               (!req->r_aborted && rinfo->head->result == 0) ?
-                               req->r_fmode : -1,
+                               (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+                               rinfo->head->result == 0) ?  req->r_fmode : -1,
                                &req->r_caps_reservation);
                if (err < 0) {
                        pr_err("fill_inode badness %p %llx.%llx\n",
@@ -1247,8 +1220,9 @@ retry_lookup:
         * ignore null lease/binding on snapdir ENOENT, or else we
         * will have trouble splicing in the virtual snapdir later
         */
-       if (rinfo->head->is_dentry && !req->r_aborted &&
-           req->r_locked_dir &&
+       if (rinfo->head->is_dentry &&
+            !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+           test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
            (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
                                               fsc->mount_options->snapdir_name,
                                               req->r_dentry->d_name.len))) {
@@ -1257,17 +1231,19 @@ retry_lookup:
                 * mknod symlink mkdir  : null -> new inode
                 * unlink               : linked -> null
                 */
-               struct inode *dir = req->r_locked_dir;
+               struct inode *dir = req->r_parent;
                struct dentry *dn = req->r_dentry;
                bool have_dir_cap, have_lease;
 
                BUG_ON(!dn);
                BUG_ON(!dir);
                BUG_ON(d_inode(dn->d_parent) != dir);
-               BUG_ON(ceph_ino(dir) !=
-                      le64_to_cpu(rinfo->diri.in->ino));
-               BUG_ON(ceph_snap(dir) !=
-                      le64_to_cpu(rinfo->diri.in->snapid));
+
+               dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+               dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+               BUG_ON(ceph_ino(dir) != dvino.ino);
+               BUG_ON(ceph_snap(dir) != dvino.snap);
 
                /* do we have a lease on the whole dir? */
                have_dir_cap =
@@ -1319,12 +1295,13 @@ retry_lookup:
                                ceph_dir_clear_ordered(dir);
                                dout("d_delete %p\n", dn);
                                d_delete(dn);
-                       } else {
-                               if (have_lease && d_unhashed(dn))
+                       } else if (have_lease) {
+                               if (d_unhashed(dn))
                                        d_add(dn, NULL);
                                update_dentry_lease(dn, rinfo->dlease,
                                                    session,
-                                                   req->r_request_started);
+                                                   req->r_request_started,
+                                                   NULL, &dvino);
                        }
                        goto done;
                }
@@ -1347,15 +1324,19 @@ retry_lookup:
                        have_lease = false;
                }
 
-               if (have_lease)
+               if (have_lease) {
+                       tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+                       tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
                        update_dentry_lease(dn, rinfo->dlease, session,
-                                           req->r_request_started);
+                                           req->r_request_started,
+                                           &tvino, &dvino);
+               }
                dout(" final dn %p\n", dn);
-       } else if (!req->r_aborted &&
-                  (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
-                   req->r_op == CEPH_MDS_OP_MKSNAP)) {
+       } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+                   req->r_op == CEPH_MDS_OP_MKSNAP) &&
+                  !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                struct dentry *dn = req->r_dentry;
-               struct inode *dir = req->r_locked_dir;
+               struct inode *dir = req->r_parent;
 
                /* fill out a snapdir LOOKUPSNAP dentry */
                BUG_ON(!dn);
@@ -1370,6 +1351,26 @@ retry_lookup:
                        goto done;
                }
                req->r_dentry = dn;  /* may have spliced */
+       } else if (rinfo->head->is_dentry) {
+               struct ceph_vino *ptvino = NULL;
+
+               if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
+                   le32_to_cpu(rinfo->dlease->duration_ms)) {
+                       dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+                       dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+                       if (rinfo->head->is_target) {
+                               tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+                               tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+                               ptvino = &tvino;
+                       }
+
+                       update_dentry_lease(req->r_dentry, rinfo->dlease,
+                               session, req->r_request_started, ptvino,
+                               &dvino);
+               } else {
+                       dout("%s: no dentry lease or dir cap\n", __func__);
+               }
        }
 done:
        dout("fill_trace done err=%d\n", err);
@@ -1478,7 +1479,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        u32 fpos_offset;
        struct ceph_readdir_cache_control cache_ctl = {};
 
-       if (req->r_aborted)
+       if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
                return readdir_prepopulate_inodes_only(req, session);
 
        if (rinfo->hash_order && req->r_path2) {
@@ -1523,14 +1524,14 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        /* FIXME: release caps/leases if error occurs */
        for (i = 0; i < rinfo->dir_nr; i++) {
                struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
-               struct ceph_vino vino;
+               struct ceph_vino tvino, dvino;
 
                dname.name = rde->name;
                dname.len = rde->name_len;
                dname.hash = full_name_hash(parent, dname.name, dname.len);
 
-               vino.ino = le64_to_cpu(rde->inode.in->ino);
-               vino.snap = le64_to_cpu(rde->inode.in->snapid);
+               tvino.ino = le64_to_cpu(rde->inode.in->ino);
+               tvino.snap = le64_to_cpu(rde->inode.in->snapid);
 
                if (rinfo->hash_order) {
                        u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
@@ -1559,8 +1560,8 @@ retry_lookup:
                                goto out;
                        }
                } else if (d_really_is_positive(dn) &&
-                          (ceph_ino(d_inode(dn)) != vino.ino ||
-                           ceph_snap(d_inode(dn)) != vino.snap)) {
+                          (ceph_ino(d_inode(dn)) != tvino.ino ||
+                           ceph_snap(d_inode(dn)) != tvino.snap)) {
                        dout(" dn %p points to wrong inode %p\n",
                             dn, d_inode(dn));
                        d_delete(dn);
@@ -1572,7 +1573,7 @@ retry_lookup:
                if (d_really_is_positive(dn)) {
                        in = d_inode(dn);
                } else {
-                       in = ceph_get_inode(parent->d_sb, vino);
+                       in = ceph_get_inode(parent->d_sb, tvino);
                        if (IS_ERR(in)) {
                                dout("new_inode badness\n");
                                d_drop(dn);
@@ -1617,8 +1618,9 @@ retry_lookup:
 
                ceph_dentry(dn)->offset = rde->offset;
 
+               dvino = ceph_vino(d_inode(parent));
                update_dentry_lease(dn, rde->lease, req->r_session,
-                                   req->r_request_started);
+                                   req->r_request_started, &tvino, &dvino);
 
                if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
                        ret = fill_readdir_cache(d_inode(parent), dn,
@@ -1632,7 +1634,7 @@ next_item:
        }
 out:
        if (err == 0 && skipped == 0) {
-               req->r_did_prepopulate = true;
+               set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
                req->r_readdir_cache_idx = cache_ctl.index;
        }
        ceph_readdir_cache_release(&cache_ctl);
@@ -1720,7 +1722,7 @@ static void ceph_invalidate_work(struct work_struct *work)
 
        mutex_lock(&ci->i_truncate_mutex);
 
-       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
                                    inode, ceph_ino(inode));
                mapping_set_error(inode->i_mapping, -EIO);
index 7d752d53353a24e742fff660ab33f2435727f67e..4c9c72f26eb90c6fd3693dc8f6ad7e9eda458ca3 100644 (file)
@@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
                l.stripe_count = ci->i_layout.stripe_count;
                l.object_size = ci->i_layout.object_size;
                l.data_pool = ci->i_layout.pool_id;
-               l.preferred_osd = (s32)-1;
+               l.preferred_osd = -1;
                if (copy_to_user(arg, &l, sizeof(l)))
                        return -EFAULT;
        }
@@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
                nl.data_pool = ci->i_layout.pool_id;
 
        /* this is obsolete, and always -1 */
-       nl.preferred_osd = le64_to_cpu(-1);
+       nl.preferred_osd = -1;
 
        err = __validate_layout(mdsc, &nl);
        if (err)
index c9d2e553a6c487f01bd11ed4c7a2c15ddfcd058d..c681762d76e66be1edf7004b1de8c13568ec6022 100644 (file)
@@ -547,8 +547,8 @@ void ceph_mdsc_release_request(struct kref *kref)
                ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
                iput(req->r_inode);
        }
-       if (req->r_locked_dir)
-               ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+       if (req->r_parent)
+               ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
        iput(req->r_target_inode);
        if (req->r_dentry)
                dput(req->r_dentry);
@@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 {
        dout("__unregister_request %p tid %lld\n", req, req->r_tid);
 
+       /* Never leave an unregistered request on an unsafe list! */
+       list_del_init(&req->r_unsafe_item);
+
        if (req->r_tid == mdsc->oldest_tid) {
                struct rb_node *p = rb_next(&req->r_node);
                mdsc->oldest_tid = 0;
@@ -644,13 +647,15 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 
        erase_request(&mdsc->request_tree, req);
 
-       if (req->r_unsafe_dir && req->r_got_unsafe) {
+       if (req->r_unsafe_dir  &&
+           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_dir_item);
                spin_unlock(&ci->i_unsafe_lock);
        }
-       if (req->r_target_inode && req->r_got_unsafe) {
+       if (req->r_target_inode &&
+           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_target_item);
@@ -667,6 +672,28 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
        ceph_mdsc_put_request(req);
 }
 
+/*
+ * Walk up the dentry tree from @dentry until we hit a dentry whose inode is
+ * not a snapshot inode (or a negative dentry, or the root). The caller must
+ * hold rcu_read_lock() so the objects can't disappear while we walk. On
+ * exit we try to take a reference on the last inode examined and return it;
+ * the result is NULL if no inode was found or the reference was not taken.
+ */
+static struct inode *get_nonsnap_parent(struct dentry *dentry)
+{
+       struct inode *inode = NULL;
+
+       while (dentry && !IS_ROOT(dentry)) {
+               inode = d_inode_rcu(dentry);
+               if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
+                       break;  /* negative dentry or non-snap inode */
+               dentry = dentry->d_parent;
+       }
+       if (inode)      /* may still be a snap inode if we stopped at the root */
+               inode = igrab(inode);   /* presumably NULL if inode is going away */
+       return inode;
+}
+
 /*
  * Choose mds to send request to next.  If there is a hint set in the
  * request (e.g., due to a prior forward hint from the mds), use that.
@@ -675,19 +702,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
  *
  * Called under mdsc->mutex.
  */
-static struct dentry *get_nonsnap_parent(struct dentry *dentry)
-{
-       /*
-        * we don't need to worry about protecting the d_parent access
-        * here because we never renaming inside the snapped namespace
-        * except to resplice to another snapdir, and either the old or new
-        * result is a valid result.
-        */
-       while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
-               dentry = dentry->d_parent;
-       return dentry;
-}
-
 static int __choose_mds(struct ceph_mds_client *mdsc,
                        struct ceph_mds_request *req)
 {
@@ -697,7 +711,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        int mode = req->r_direct_mode;
        int mds = -1;
        u32 hash = req->r_direct_hash;
-       bool is_hash = req->r_direct_is_hash;
+       bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
 
        /*
         * is there a specific mds we should try?  ignore hint if we have
@@ -717,30 +731,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
        inode = NULL;
        if (req->r_inode) {
                inode = req->r_inode;
+               ihold(inode);
        } else if (req->r_dentry) {
                /* ignore race with rename; old or new d_parent is okay */
-               struct dentry *parent = req->r_dentry->d_parent;
-               struct inode *dir = d_inode(parent);
+               struct dentry *parent;
+               struct inode *dir;
+
+               rcu_read_lock();
+               parent = req->r_dentry->d_parent;
+               dir = req->r_parent ? : d_inode_rcu(parent);
 
-               if (dir->i_sb != mdsc->fsc->sb) {
-                       /* not this fs! */
+               if (!dir || dir->i_sb != mdsc->fsc->sb) {
+                       /*  not this fs or parent went negative */
                        inode = d_inode(req->r_dentry);
+                       if (inode)
+                               ihold(inode);
                } else if (ceph_snap(dir) != CEPH_NOSNAP) {
                        /* direct snapped/virtual snapdir requests
                         * based on parent dir inode */
-                       struct dentry *dn = get_nonsnap_parent(parent);
-                       inode = d_inode(dn);
+                       inode = get_nonsnap_parent(parent);
                        dout("__choose_mds using nonsnap parent %p\n", inode);
                } else {
                        /* dentry target */
                        inode = d_inode(req->r_dentry);
                        if (!inode || mode == USE_AUTH_MDS) {
                                /* dir + name */
-                               inode = dir;
+                               inode = igrab(dir);
                                hash = ceph_dentry_hash(dir, req->r_dentry);
                                is_hash = true;
+                       } else {
+                               ihold(inode);
                        }
                }
+               rcu_read_unlock();
        }
 
        dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
@@ -769,7 +792,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                     (int)r, frag.ndist);
                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
                                    CEPH_MDS_STATE_ACTIVE)
-                                       return mds;
+                                       goto out;
                        }
 
                        /* since this file/dir wasn't known to be
@@ -784,7 +807,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                                     inode, ceph_vinop(inode), frag.frag, mds);
                                if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
                                    CEPH_MDS_STATE_ACTIVE)
-                                       return mds;
+                                       goto out;
                        }
                }
        }
@@ -797,6 +820,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
        if (!cap) {
                spin_unlock(&ci->i_ceph_lock);
+               iput(inode);
                goto random;
        }
        mds = cap->session->s_mds;
@@ -804,6 +828,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
             inode, ceph_vinop(inode), mds,
             cap == ci->i_auth_cap ? "auth " : "", cap);
        spin_unlock(&ci->i_ceph_lock);
+out:
+       iput(inode);
        return mds;
 
 random:
@@ -1036,7 +1062,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
        while (!list_empty(&session->s_unsafe)) {
                req = list_first_entry(&session->s_unsafe,
                                       struct ceph_mds_request, r_unsafe_item);
-               list_del_init(&req->r_unsafe_item);
                pr_warn_ratelimited(" dropping unsafe request %llu\n",
                                    req->r_tid);
                __unregister_request(mdsc, req);
@@ -1146,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 
                if (ci->i_wrbuffer_ref > 0 &&
-                   ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+                   READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                        invalidate = true;
 
                while (!list_empty(&ci->i_cap_flush_list)) {
@@ -1775,18 +1800,23 @@ retry:
        return path;
 }
 
-static int build_dentry_path(struct dentry *dentry,
+static int build_dentry_path(struct dentry *dentry, struct inode *dir,
                             const char **ppath, int *ppathlen, u64 *pino,
                             int *pfreepath)
 {
        char *path;
 
-       if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) {
-               *pino = ceph_ino(d_inode(dentry->d_parent));
+       rcu_read_lock();
+       if (!dir)
+               dir = d_inode_rcu(dentry->d_parent);
+       if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
+               *pino = ceph_ino(dir);
+               rcu_read_unlock();
                *ppath = dentry->d_name.name;
                *ppathlen = dentry->d_name.len;
                return 0;
        }
+       rcu_read_unlock();
        path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
        if (IS_ERR(path))
                return PTR_ERR(path);
@@ -1822,8 +1852,8 @@ static int build_inode_path(struct inode *inode,
  * an explicit ino+path.
  */
 static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
-                                 const char *rpath, u64 rino,
-                                 const char **ppath, int *pathlen,
+                                 struct inode *rdiri, const char *rpath,
+                                 u64 rino, const char **ppath, int *pathlen,
                                  u64 *ino, int *freepath)
 {
        int r = 0;
@@ -1833,7 +1863,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
                dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
                     ceph_snap(rinode));
        } else if (rdentry) {
-               r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
+               r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
+                                       freepath);
                dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
                     *ppath);
        } else if (rpath || rino) {
@@ -1866,7 +1897,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        int ret;
 
        ret = set_request_path_attr(req->r_inode, req->r_dentry,
-                             req->r_path1, req->r_ino1.ino,
+                             req->r_parent, req->r_path1, req->r_ino1.ino,
                              &path1, &pathlen1, &ino1, &freepath1);
        if (ret < 0) {
                msg = ERR_PTR(ret);
@@ -1874,6 +1905,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        }
 
        ret = set_request_path_attr(NULL, req->r_old_dentry,
+                             req->r_old_dentry_dir,
                              req->r_path2, req->r_ino2.ino,
                              &path2, &pathlen2, &ino2, &freepath2);
        if (ret < 0) {
@@ -1927,10 +1959,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
                      mds, req->r_inode_drop, req->r_inode_unless, 0);
        if (req->r_dentry_drop)
                releases += ceph_encode_dentry_release(&p, req->r_dentry,
-                      mds, req->r_dentry_drop, req->r_dentry_unless);
+                               req->r_parent, mds, req->r_dentry_drop,
+                               req->r_dentry_unless);
        if (req->r_old_dentry_drop)
                releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
-                      mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+                               req->r_old_dentry_dir, mds,
+                               req->r_old_dentry_drop,
+                               req->r_old_dentry_unless);
        if (req->r_old_inode_drop)
                releases += ceph_encode_inode_release(&p,
                      d_inode(req->r_old_dentry),
@@ -2012,7 +2047,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
             req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
-       if (req->r_got_unsafe) {
+       if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                void *p;
                /*
                 * Replay.  Do not regenerate message (and rebuild
@@ -2061,16 +2096,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 
        rhead = msg->front.iov_base;
        rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
-       if (req->r_got_unsafe)
+       if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                flags |= CEPH_MDS_FLAG_REPLAY;
-       if (req->r_locked_dir)
+       if (req->r_parent)
                flags |= CEPH_MDS_FLAG_WANT_DENTRY;
        rhead->flags = cpu_to_le32(flags);
        rhead->num_fwd = req->r_num_fwd;
        rhead->num_retry = req->r_attempts - 1;
        rhead->ino = 0;
 
-       dout(" r_locked_dir = %p\n", req->r_locked_dir);
+       dout(" r_parent = %p\n", req->r_parent);
        return 0;
 }
 
@@ -2084,8 +2119,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
        int mds = -1;
        int err = 0;
 
-       if (req->r_err || req->r_got_result) {
-               if (req->r_aborted)
+       if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+               if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
                        __unregister_request(mdsc, req);
                goto out;
        }
@@ -2096,12 +2131,12 @@ static int __do_request(struct ceph_mds_client *mdsc,
                err = -EIO;
                goto finish;
        }
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                dout("do_request forced umount\n");
                err = -EIO;
                goto finish;
        }
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
                if (mdsc->mdsmap_err) {
                        err = mdsc->mdsmap_err;
                        dout("do_request mdsmap err %d\n", err);
@@ -2215,7 +2250,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
        while (p) {
                req = rb_entry(p, struct ceph_mds_request, r_node);
                p = rb_next(p);
-               if (req->r_got_unsafe)
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                        continue;
                if (req->r_attempts > 0)
                        continue; /* only new requests */
@@ -2250,11 +2285,11 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 
        dout("do_request on %p\n", req);
 
-       /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+       /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
        if (req->r_inode)
                ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
-       if (req->r_locked_dir)
-               ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+       if (req->r_parent)
+               ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
        if (req->r_old_dentry_dir)
                ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
                                  CEPH_CAP_PIN);
@@ -2289,7 +2324,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
        mutex_lock(&mdsc->mutex);
 
        /* only abort if we didn't race with a real reply */
-       if (req->r_got_result) {
+       if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
                err = le32_to_cpu(req->r_reply_info.head->result);
        } else if (err < 0) {
                dout("aborted request %lld with %d\n", req->r_tid, err);
@@ -2301,10 +2336,10 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
                 */
                mutex_lock(&req->r_fill_mutex);
                req->r_err = err;
-               req->r_aborted = true;
+               set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
                mutex_unlock(&req->r_fill_mutex);
 
-               if (req->r_locked_dir &&
+               if (req->r_parent &&
                    (req->r_op & CEPH_MDS_OP_WRITE))
                        ceph_invalidate_dir_request(req);
        } else {
@@ -2323,7 +2358,7 @@ out:
  */
 void ceph_invalidate_dir_request(struct ceph_mds_request *req)
 {
-       struct inode *inode = req->r_locked_dir;
+       struct inode *inode = req->r_parent;
 
        dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
 
@@ -2379,14 +2414,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
 
        /* dup? */
-       if ((req->r_got_unsafe && !head->safe) ||
-           (req->r_got_safe && head->safe)) {
+       if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
+           (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
                pr_warn("got a dup %s reply on %llu from mds%d\n",
                           head->safe ? "safe" : "unsafe", tid, mds);
                mutex_unlock(&mdsc->mutex);
                goto out;
        }
-       if (req->r_got_safe) {
+       if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
                pr_warn("got unsafe after safe on %llu from mds%d\n",
                           tid, mds);
                mutex_unlock(&mdsc->mutex);
@@ -2425,10 +2460,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 
 
        if (head->safe) {
-               req->r_got_safe = true;
+               set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
                __unregister_request(mdsc, req);
 
-               if (req->r_got_unsafe) {
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                        /*
                         * We already handled the unsafe response, now do the
                         * cleanup.  No need to examine the response; the MDS
@@ -2437,7 +2472,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                         * useful we could do with a revised return value.
                         */
                        dout("got safe reply %llu, mds%d\n", tid, mds);
-                       list_del_init(&req->r_unsafe_item);
 
                        /* last unsafe request during umount? */
                        if (mdsc->stopping && !__get_oldest_req(mdsc))
@@ -2446,7 +2480,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        goto out;
                }
        } else {
-               req->r_got_unsafe = true;
+               set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
                list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
                if (req->r_unsafe_dir) {
                        struct ceph_inode_info *ci =
@@ -2486,7 +2520,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        /* insert trace into our cache */
        mutex_lock(&req->r_fill_mutex);
        current->journal_info = req;
-       err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
+       err = ceph_fill_trace(mdsc->fsc->sb, req);
        if (err == 0) {
                if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
                                    req->r_op == CEPH_MDS_OP_LSSNAP))
@@ -2500,7 +2534,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        if (realm)
                ceph_put_snap_realm(mdsc, realm);
 
-       if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+       if (err == 0 && req->r_target_inode &&
+           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
                struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
                spin_lock(&ci->i_unsafe_lock);
                list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
@@ -2508,12 +2543,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
 out_err:
        mutex_lock(&mdsc->mutex);
-       if (!req->r_aborted) {
+       if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                if (err) {
                        req->r_err = err;
                } else {
                        req->r_reply =  ceph_msg_get(msg);
-                       req->r_got_result = true;
+                       set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
                }
        } else {
                dout("reply arrived after request %lld was aborted\n", tid);
@@ -2557,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
                goto out;  /* dup reply? */
        }
 
-       if (req->r_aborted) {
+       if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
                dout("forward tid %llu aborted, unregistering\n", tid);
                __unregister_request(mdsc, req);
        } else if (fwd_seq <= req->r_num_fwd) {
@@ -2567,7 +2602,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
                /* resend. forward race not possible; mds would drop */
                dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
                BUG_ON(req->r_err);
-               BUG_ON(req->r_got_result);
+               BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
                req->r_attempts = 0;
                req->r_num_fwd = fwd_seq;
                req->r_resend_mds = next_mds;
@@ -2732,7 +2767,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
        while (p) {
                req = rb_entry(p, struct ceph_mds_request, r_node);
                p = rb_next(p);
-               if (req->r_got_unsafe)
+               if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                        continue;
                if (req->r_attempts == 0)
                        continue; /* only old requests */
@@ -3556,7 +3591,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
        u64 want_tid, want_flush;
 
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return;
 
        dout("sync\n");
@@ -3587,7 +3622,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  */
 static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
 {
-       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+       if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return true;
        return atomic_read(&mdsc->num_sessions) <= skipped;
 }
index 3c6f77b7bb02107f9edc579fd5dc12ce57c8b7f5..ac0475a2daa749d3d689956cc45a2913c955ca8f 100644 (file)
@@ -202,9 +202,18 @@ struct ceph_mds_request {
        char *r_path1, *r_path2;
        struct ceph_vino r_ino1, r_ino2;
 
-       struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+       struct inode *r_parent;             /* parent dir inode */
        struct inode *r_target_inode;       /* resulting inode */
 
+#define CEPH_MDS_R_DIRECT_IS_HASH      (1) /* r_direct_hash is valid */
+#define CEPH_MDS_R_ABORTED             (2) /* call was aborted */
+#define CEPH_MDS_R_GOT_UNSAFE          (3) /* got an unsafe reply */
+#define CEPH_MDS_R_GOT_SAFE            (4) /* got a safe reply */
+#define CEPH_MDS_R_GOT_RESULT          (5) /* got a result */
+#define CEPH_MDS_R_DID_PREPOPULATE     (6) /* prepopulated readdir */
+#define CEPH_MDS_R_PARENT_LOCKED       (7) /* is r_parent->i_rwsem wlocked? */
+       unsigned long   r_req_flags;
+
        struct mutex r_fill_mutex;
 
        union ceph_mds_request_args r_args;
@@ -216,7 +225,6 @@ struct ceph_mds_request {
        /* for choosing which mds to send this request to */
        int r_direct_mode;
        u32 r_direct_hash;      /* choose dir frag based on this dentry hash */
-       bool r_direct_is_hash;  /* true if r_direct_hash is valid */
 
        /* data payload is used for xattr ops */
        struct ceph_pagelist *r_pagelist;
@@ -234,7 +242,6 @@ struct ceph_mds_request {
        struct ceph_mds_reply_info_parsed r_reply_info;
        struct page *r_locked_page;
        int r_err;
-       bool r_aborted;
 
        unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
        unsigned long r_started;  /* start time to measure timeout against */
@@ -262,9 +269,7 @@ struct ceph_mds_request {
        ceph_mds_request_callback_t r_callback;
        ceph_mds_request_wait_callback_t r_wait_for_completion;
        struct list_head  r_unsafe_item;  /* per-session unsafe list item */
-       bool              r_got_unsafe, r_got_safe, r_got_result;
 
-       bool              r_did_prepopulate;
        long long         r_dir_release_cnt;
        long long         r_dir_ordered_cnt;
        int               r_readdir_cache_idx;
index 6bd20d707bfd885aff2f89a4b7266cc1c05fd5c8..0ec8d0114e57ba80fdc46b1acdc9b7de7373e276 100644 (file)
@@ -757,7 +757,6 @@ static const struct super_operations ceph_super_ops = {
        .destroy_inode  = ceph_destroy_inode,
        .write_inode    = ceph_write_inode,
        .drop_inode     = ceph_drop_inode,
-       .evict_inode    = ceph_evict_inode,
        .sync_fs        = ceph_sync_fs,
        .put_super      = ceph_put_super,
        .show_options   = ceph_show_options,
@@ -952,6 +951,14 @@ static int ceph_register_bdi(struct super_block *sb,
                fsc->backing_dev_info.ra_pages =
                        VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
 
+       if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
+           fsc->mount_options->rsize >= PAGE_SIZE)
+               fsc->backing_dev_info.io_pages =
+                       (fsc->mount_options->rsize + PAGE_SIZE - 1)
+                       >> PAGE_SHIFT;
+       else if (fsc->mount_options->rsize == 0)
+               fsc->backing_dev_info.io_pages = ULONG_MAX;
+
        err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
                           atomic_long_inc_return(&bdi_seq));
        if (!err)
index 3373b61faefd0fac7d240438e5bb2dca7e3433db..e9410bcf41135b72d6a782c9d5dbf1df29bcd911 100644 (file)
@@ -45,8 +45,8 @@
 #define ceph_test_mount_opt(fsc, opt) \
        (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
-#define CEPH_RSIZE_DEFAULT             0           /* max read size */
-#define CEPH_RASIZE_DEFAULT            (8192*1024) /* readahead */
+#define CEPH_RSIZE_DEFAULT              (64*1024*1024) /* max read size */
+#define CEPH_RASIZE_DEFAULT             (8192*1024)    /* max readahead */
 #define CEPH_MAX_READDIR_DEFAULT        1024
 #define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
 #define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
@@ -343,7 +343,6 @@ struct ceph_inode_info {
        u32 i_rdcache_gen;      /* incremented each time we get FILE_CACHE. */
        u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
 
-       struct list_head i_unsafe_writes; /* uncommitted sync writes */
        struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
        struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
        spinlock_t i_unsafe_lock;
@@ -602,7 +601,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
 }
 
 /* what the mds thinks we want */
-extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
 
 extern void ceph_caps_init(struct ceph_mds_client *mdsc);
 extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
@@ -753,7 +752,6 @@ extern const struct inode_operations ceph_file_iops;
 extern struct inode *ceph_alloc_inode(struct super_block *sb);
 extern void ceph_destroy_inode(struct inode *inode);
 extern int ceph_drop_inode(struct inode *inode);
-extern void ceph_evict_inode(struct inode *inode);
 
 extern struct inode *ceph_get_inode(struct super_block *sb,
                                    struct ceph_vino vino);
@@ -764,8 +762,7 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
                                u64 time_warp_seq, struct timespec *ctime,
                                struct timespec *mtime, struct timespec *atime);
 extern int ceph_fill_trace(struct super_block *sb,
-                          struct ceph_mds_request *req,
-                          struct ceph_mds_session *session);
+                          struct ceph_mds_request *req);
 extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                                    struct ceph_mds_session *session);
 
@@ -904,6 +901,7 @@ extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
 extern int ceph_encode_inode_release(void **p, struct inode *inode,
                                     int mds, int drop, int unless, int force);
 extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+                                     struct inode *dir,
                                      int mds, int drop, int unless);
 
 extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
@@ -933,7 +931,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 extern int ceph_release(struct inode *inode, struct file *filp);
 extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
                                  char *data, size_t len);
-extern void ceph_sync_write_wait(struct inode *inode);
+
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
 extern const struct file_operations ceph_snapdir_fops;
index 03a6653d329a01b90803d373f7a955230d787675..2ea0c282f3dc9326f7b3c4b7a3883758831ed251 100644 (file)
@@ -22,7 +22,6 @@ struct ceph_osd_client;
  * completion callback for async writepages
  */
 typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
-typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
 
 #define CEPH_HOMELESS_OSD      -1
 
@@ -170,15 +169,12 @@ struct ceph_osd_request {
        unsigned int            r_num_ops;
 
        int               r_result;
-       bool              r_got_reply;
 
        struct ceph_osd_client *r_osdc;
        struct kref       r_kref;
        bool              r_mempool;
-       struct completion r_completion;
-       struct completion r_done_completion;  /* fsync waiter */
+       struct completion r_completion;       /* private to osd_client.c */
        ceph_osdc_callback_t r_callback;
-       ceph_osdc_unsafe_callback_t r_unsafe_callback;
        struct list_head  r_unsafe_item;
 
        struct inode *r_inode;                /* for use by callbacks */
index 9a9041784dcff383169a169c39203dd79f383438..938656f708078e8a2fd078bc7cace623a32b755c 100644 (file)
@@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
        case CEPH_POOL_TYPE_EC:
                return false;
        default:
-               BUG_ON(1);
+               BUG();
        }
 }
 
@@ -81,13 +81,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest,
                    const struct ceph_object_locator *src);
 void ceph_oloc_destroy(struct ceph_object_locator *oloc);
 
-/*
- * Maximum supported by kernel client object name length
- *
- * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
- */
-#define CEPH_MAX_OID_NAME_LEN 100
-
 /*
  * 51-char inline_name is long enough for all cephfs and all but one
  * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
@@ -173,8 +166,8 @@ struct ceph_osdmap {
         * the list of osds that store+replicate them. */
        struct crush_map *crush;
 
-       struct mutex crush_scratch_mutex;
-       int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
+       struct mutex crush_workspace_mutex;
+       void *crush_workspace;
 };
 
 static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
index 5c0da61cb763124c651e3287f4ecaf69f8cdfbc8..5d0018782d504ce61fa3dff2ed526636302b7ef4 100644 (file)
@@ -50,7 +50,7 @@ struct ceph_timespec {
 #define CEPH_PG_LAYOUT_LINEAR 2
 #define CEPH_PG_LAYOUT_HYBRID 3
 
-#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
+#define CEPH_PG_MAX_SIZE      32  /* max # osds in a single pg */
 
 /*
  * placement group.
index be8f12b8f1950499380c10de27ab6928df25fd8e..fbecbd089d75f4a9eead25b1439e6629893c3639 100644 (file)
@@ -135,13 +135,6 @@ struct crush_bucket {
        __u32 size;      /* num items */
        __s32 *items;
 
-       /*
-        * cached random permutation: used for uniform bucket and for
-        * the linear search fallback for the other bucket types.
-        */
-       __u32 perm_x;  /* @x for which *perm is defined */
-       __u32 perm_n;  /* num elements of *perm that are permuted/defined */
-       __u32 *perm;
 };
 
 struct crush_bucket_uniform {
@@ -211,6 +204,21 @@ struct crush_map {
         * device fails. */
        __u8 chooseleaf_stable;
 
+       /*
+        * This value is calculated after decode or construction by
+        * the builder. It is exposed here (rather than having a
+        * 'build CRUSH working space' function) so that callers can
+        * reserve a static buffer, allocate space on the stack, or
+        * otherwise avoid calling into the heap allocator if they
+        * want to. The size of the working space depends on the map,
+        * while the size of the scratch vector passed to the mapper
+        * depends on the size of the desired result set.
+        *
+        * Nothing stops the caller from allocating both in one swell
+        * foop and passing in two pointers, though.
+        */
+       size_t working_size;
+
 #ifndef __KERNEL__
        /*
         * version 0 (original) of straw_calc has various flaws.  version 1
@@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i)
        return ((i+1) << 1)-1;
 }
 
+/*
+ * These data structures are private to the CRUSH implementation. They
+ * are exposed in this header file because builder needs their
+ * definitions to calculate the total working size.
+ *
+ * Moving this out of the crush map allows us to treat the CRUSH map as
+ * immutable within the mapper and removes the requirement for a CRUSH
+ * map lock.
+ */
+struct crush_work_bucket {
+       __u32 perm_x; /* @x for which *perm is defined */
+       __u32 perm_n; /* num elements of *perm that are permuted/defined */
+       __u32 *perm;  /* Permutation of the bucket's items */
+};
+
+struct crush_work {
+       struct crush_work_bucket **work; /* Per-bucket working store */
+};
+
 #endif
index 5dfd5b1125d2b257a4a00d1e77661613ca2227ec..c95e19e1ff11c5f69e4b3d05d328ce071ef74856 100644 (file)
@@ -15,6 +15,20 @@ extern int crush_do_rule(const struct crush_map *map,
                         int ruleno,
                         int x, int *result, int result_max,
                         const __u32 *weights, int weight_max,
-                        int *scratch);
+                        void *cwin);
+
+/*
+ * Returns the exact amount of workspace that will need to be used
+ * for a given combination of crush_map and result_max. The caller can
+ * then allocate this much on its own, either on the stack, in a
+ * per-thread long-lived buffer, or however it likes.
+ */
+static inline size_t crush_work_size(const struct crush_map *map,
+                                    int result_max)
+{
+       return map->working_size + result_max * 3 * sizeof(__u32);
+}
+
+void crush_init_workspace(const struct crush_map *map, void *v);
 
 #endif
index 50f040fdb2a97f4278fce4130a0cf4bbe3c87052..b9233b9903990bd38721213ce287a8045debd8c7 100644 (file)
@@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
        dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n",
             __func__, lock_name, type, cookie, tag, desc, flags);
        ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock",
-                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                            lock_op_page, lock_op_buf_size, NULL, NULL);
+                            CEPH_OSD_FLAG_WRITE, lock_op_page,
+                            lock_op_buf_size, NULL, NULL);
 
        dout("%s: status %d\n", __func__, ret);
        __free_page(lock_op_page);
@@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc,
 
        dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie);
        ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock",
-                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                            unlock_op_page, unlock_op_buf_size, NULL, NULL);
+                            CEPH_OSD_FLAG_WRITE, unlock_op_page,
+                            unlock_op_buf_size, NULL, NULL);
 
        dout("%s: status %d\n", __func__, ret);
        __free_page(unlock_op_page);
@@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
        dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name,
             cookie, ENTITY_NAME(*locker));
        ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock",
-                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
-                            break_op_page, break_op_buf_size, NULL, NULL);
+                            CEPH_OSD_FLAG_WRITE, break_op_page,
+                            break_op_buf_size, NULL, NULL);
 
        dout("%s: status %d\n", __func__, ret);
        __free_page(break_op_page);
@@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
        int get_info_op_buf_size;
        int name_len = strlen(lock_name);
        struct page *get_info_op_page, *reply_page;
-       size_t reply_len;
+       size_t reply_len = PAGE_SIZE;
        void *p, *end;
        int ret;
 
index 80d7c3a97cb84355e82e9d8f4c83fbf5b0d82893..5bf94c04f64547e2cfff79c0655bcc68944f4e12 100644 (file)
@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 
 void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
 {
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
 {
        kfree(b->item_weights);
        kfree(b->sum_weights);
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
 
 void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
 {
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b->node_weights);
        kfree(b);
@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 {
        kfree(b->straws);
        kfree(b->item_weights);
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
 {
        kfree(b->item_weights);
-       kfree(b->h.perm);
        kfree(b->h.items);
        kfree(b);
 }
index 130ab407c5ecf8ca5c0943759efff91c7bf258e8..b5cd8c21bfdfbf4d85bd93993f1f02807c5200be 100644 (file)
@@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
        return -1;
 }
 
-
 /*
  * bucket choose methods
  *
@@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
  * Since this is expensive, we optimize for the r=0 case, which
  * captures the vast majority of calls.
  */
-static int bucket_perm_choose(struct crush_bucket *bucket,
+static int bucket_perm_choose(const struct crush_bucket *bucket,
+                             struct crush_work_bucket *work,
                              int x, int r)
 {
        unsigned int pr = r % bucket->size;
        unsigned int i, s;
 
        /* start a new permutation if @x has changed */
-       if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
+       if (work->perm_x != (__u32)x || work->perm_n == 0) {
                dprintk("bucket %d new x=%d\n", bucket->id, x);
-               bucket->perm_x = x;
+               work->perm_x = x;
 
                /* optimize common r=0 case */
                if (pr == 0) {
                        s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
                                bucket->size;
-                       bucket->perm[0] = s;
-                       bucket->perm_n = 0xffff;   /* magic value, see below */
+                       work->perm[0] = s;
+                       work->perm_n = 0xffff;   /* magic value, see below */
                        goto out;
                }
 
                for (i = 0; i < bucket->size; i++)
-                       bucket->perm[i] = i;
-               bucket->perm_n = 0;
-       } else if (bucket->perm_n == 0xffff) {
+                       work->perm[i] = i;
+               work->perm_n = 0;
+       } else if (work->perm_n == 0xffff) {
                /* clean up after the r=0 case above */
                for (i = 1; i < bucket->size; i++)
-                       bucket->perm[i] = i;
-               bucket->perm[bucket->perm[0]] = 0;
-               bucket->perm_n = 1;
+                       work->perm[i] = i;
+               work->perm[work->perm[0]] = 0;
+               work->perm_n = 1;
        }
 
        /* calculate permutation up to pr */
-       for (i = 0; i < bucket->perm_n; i++)
-               dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
-       while (bucket->perm_n <= pr) {
-               unsigned int p = bucket->perm_n;
+       for (i = 0; i < work->perm_n; i++)
+               dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
+       while (work->perm_n <= pr) {
+               unsigned int p = work->perm_n;
                /* no point in swapping the final entry */
                if (p < bucket->size - 1) {
                        i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
                                (bucket->size - p);
                        if (i) {
-                               unsigned int t = bucket->perm[p + i];
-                               bucket->perm[p + i] = bucket->perm[p];
-                               bucket->perm[p] = t;
+                               unsigned int t = work->perm[p + i];
+                               work->perm[p + i] = work->perm[p];
+                               work->perm[p] = t;
                        }
                        dprintk(" perm_choose swap %d with %d\n", p, p+i);
                }
-               bucket->perm_n++;
+               work->perm_n++;
        }
        for (i = 0; i < bucket->size; i++)
-               dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+               dprintk(" perm_choose  %d: %d\n", i, work->perm[i]);
 
-       s = bucket->perm[pr];
+       s = work->perm[pr];
 out:
        dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
                bucket->size, x, r, pr, s);
@@ -132,14 +132,14 @@ out:
 }
 
 /* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
-                                int x, int r)
+static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
+                                struct crush_work_bucket *work, int x, int r)
 {
-       return bucket_perm_choose(&bucket->h, x, r);
+       return bucket_perm_choose(&bucket->h, work, x, r);
 }
 
 /* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
+static int bucket_list_choose(const struct crush_bucket_list *bucket,
                              int x, int r)
 {
        int i;
@@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
                w *= bucket->sum_weights[i];
                w = w >> 16;
                /*dprintk(" scaled %llx\n", w);*/
-               if (w < bucket->item_weights[i])
+               if (w < bucket->item_weights[i]) {
                        return bucket->h.items[i];
+               }
        }
 
        dprintk("bad list sums for bucket %d\n", bucket->h.id);
@@ -192,7 +193,7 @@ static int terminal(int x)
        return x & 1;
 }
 
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
                              int x, int r)
 {
        int n;
@@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
 
 /* straw */
 
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
                               int x, int r)
 {
        __u32 i;
@@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
  *
  */
 
-static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
                                int x, int r)
 {
        unsigned int i, high = 0;
@@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
                        high_draw = draw;
                }
        }
+
        return bucket->h.items[high];
 }
 
 
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+static int crush_bucket_choose(const struct crush_bucket *in,
+                              struct crush_work_bucket *work,
+                              int x, int r)
 {
        dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
        BUG_ON(in->size == 0);
        switch (in->alg) {
        case CRUSH_BUCKET_UNIFORM:
-               return bucket_uniform_choose((struct crush_bucket_uniform *)in,
-                                         x, r);
+               return bucket_uniform_choose(
+                       (const struct crush_bucket_uniform *)in,
+                       work, x, r);
        case CRUSH_BUCKET_LIST:
-               return bucket_list_choose((struct crush_bucket_list *)in,
+               return bucket_list_choose((const struct crush_bucket_list *)in,
                                          x, r);
        case CRUSH_BUCKET_TREE:
-               return bucket_tree_choose((struct crush_bucket_tree *)in,
+               return bucket_tree_choose((const struct crush_bucket_tree *)in,
                                          x, r);
        case CRUSH_BUCKET_STRAW:
-               return bucket_straw_choose((struct crush_bucket_straw *)in,
-                                          x, r);
+               return bucket_straw_choose(
+                       (const struct crush_bucket_straw *)in,
+                       x, r);
        case CRUSH_BUCKET_STRAW2:
-               return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
-                                           x, r);
+               return bucket_straw2_choose(
+                       (const struct crush_bucket_straw2 *)in,
+                       x, r);
        default:
                dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
                return in->items[0];
        }
 }
 
-
 /*
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
@@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map,
  * @parent_r: r value passed from the parent
  */
 static int crush_choose_firstn(const struct crush_map *map,
-                              struct crush_bucket *bucket,
+                              struct crush_work *work,
+                              const struct crush_bucket *bucket,
                               const __u32 *weight, int weight_max,
                               int x, int numrep, int type,
                               int *out, int outpos,
@@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
        int rep;
        unsigned int ftotal, flocal;
        int retry_descent, retry_bucket, skip_rep;
-       struct crush_bucket *in = bucket;
+       const struct crush_bucket *in = bucket;
        int r;
        int i;
        int item = 0;
@@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
                                if (local_fallback_retries > 0 &&
                                    flocal >= (in->size>>1) &&
                                    flocal > local_fallback_retries)
-                                       item = bucket_perm_choose(in, x, r);
+                                       item = bucket_perm_choose(
+                                               in, work->work[-1-in->id],
+                                               x, r);
                                else
-                                       item = crush_bucket_choose(in, x, r);
+                                       item = crush_bucket_choose(
+                                               in, work->work[-1-in->id],
+                                               x, r);
                                if (item >= map->max_devices) {
                                        dprintk("   bad item %d\n", item);
                                        skip_rep = 1;
@@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map,
                                                        sub_r = r >> (vary_r-1);
                                                else
                                                        sub_r = 0;
-                                               if (crush_choose_firstn(map,
-                                                        map->buckets[-1-item],
-                                                        weight, weight_max,
-                                                        x, stable ? 1 : outpos+1, 0,
-                                                        out2, outpos, count,
-                                                        recurse_tries, 0,
-                                                        local_retries,
-                                                        local_fallback_retries,
-                                                        0,
-                                                        vary_r,
-                                                        stable,
-                                                        NULL,
-                                                        sub_r) <= outpos)
+                                               if (crush_choose_firstn(
+                                                           map,
+                                                           work,
+                                                           map->buckets[-1-item],
+                                                           weight, weight_max,
+                                                           x, stable ? 1 : outpos+1, 0,
+                                                           out2, outpos, count,
+                                                           recurse_tries, 0,
+                                                           local_retries,
+                                                           local_fallback_retries,
+                                                           0,
+                                                           vary_r,
+                                                           stable,
+                                                           NULL,
+                                                           sub_r) <= outpos)
                                                        /* didn't get leaf */
                                                        reject = 1;
                                        } else {
@@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map,
                                        }
                                }
 
-                               if (!reject) {
+                               if (!reject && !collide) {
                                        /* out? */
                                        if (itemtype == 0)
                                                reject = is_out(map, weight,
                                                                weight_max,
                                                                item, x);
-                                       else
-                                               reject = 0;
                                }
 
 reject:
@@ -600,7 +611,8 @@ reject:
  *
  */
 static void crush_choose_indep(const struct crush_map *map,
-                              struct crush_bucket *bucket,
+                              struct crush_work *work,
+                              const struct crush_bucket *bucket,
                               const __u32 *weight, int weight_max,
                               int x, int left, int numrep, int type,
                               int *out, int outpos,
@@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map,
                               int *out2,
                               int parent_r)
 {
-       struct crush_bucket *in = bucket;
+       const struct crush_bucket *in = bucket;
        int endpos = outpos + left;
        int rep;
        unsigned int ftotal;
@@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map,
                                        break;
                                }
 
-                               item = crush_bucket_choose(in, x, r);
+                               item = crush_bucket_choose(
+                                       in, work->work[-1-in->id],
+                                       x, r);
                                if (item >= map->max_devices) {
                                        dprintk("   bad item %d\n", item);
                                        out[rep] = CRUSH_ITEM_NONE;
@@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map,
 
                                if (recurse_to_leaf) {
                                        if (item < 0) {
-                                               crush_choose_indep(map,
-                                                  map->buckets[-1-item],
-                                                  weight, weight_max,
-                                                  x, 1, numrep, 0,
-                                                  out2, rep,
-                                                  recurse_tries, 0,
-                                                  0, NULL, r);
+                                               crush_choose_indep(
+                                                       map,
+                                                       work,
+                                                       map->buckets[-1-item],
+                                                       weight, weight_max,
+                                                       x, 1, numrep, 0,
+                                                       out2, rep,
+                                                       recurse_tries, 0,
+                                                       0, NULL, r);
                                                if (out2[rep] == CRUSH_ITEM_NONE) {
                                                        /* placed nothing; no leaf */
                                                        break;
@@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map,
 #endif
 }
 
+
+/*
+ * This takes a chunk of memory and sets it up to be a shiny new
+ * working area for a CRUSH placement computation. It must be called
+ * on any newly allocated memory before passing it in to
+ * crush_do_rule. It may be used repeatedly after that, so long as the
+ * map has not changed. If the map /has/ changed, you must make sure
+ * the allocated memory is no smaller than the new working size and
+ * re-run crush_init_workspace.
+ *
+ * If you do retain the working space between calls to crush, make it
+ * thread-local.
+ *
+ * @map: the crush_map the workspace is laid out for
+ * @v: start of the memory to set up; exactly map->working_size bytes
+ *     are carved up here
+ *
+ * Layout produced, in order:
+ *   - struct crush_work, whose ->work points at the array that follows
+ *   - map->max_buckets pointers to struct crush_work_bucket
+ *   - for every non-NULL bucket: one struct crush_work_bucket followed
+ *     by its permutation array of bucket->size __u32 entries
+ *
+ * Entries of ->work[] that correspond to NULL buckets are not written.
+ */
+void crush_init_workspace(const struct crush_map *map, void *v)
+{
+       struct crush_work *w = v;
+       __s32 b;
+
+       /*
+        * We work by moving through the available space and setting
+        * values and pointers as we go.
+        *
+        * It's a bit like Forth's use of the 'allot' word since we
+        * set the pointer first and then reserve the space for it to
+        * point to by incrementing the pointer.
+        */
+
+       /*
+        * Step over the crush_work header.  Use the size of the struct
+        * itself rather than of a pointer to it, so that the amount
+        * consumed here is the sizeof(struct crush_work) that
+        * crush_finalize() adds into map->working_size and that the
+        * BUG_ON() at the end checks against.
+        */
+       v += sizeof(struct crush_work);
+       w->work = v;
+       v += map->max_buckets * sizeof(struct crush_work_bucket *);
+       for (b = 0; b < map->max_buckets; ++b) {
+               if (!map->buckets[b])
+                       continue;
+
+               w->work[b] = v;
+               switch (map->buckets[b]->alg) {
+               default:
+                       /* every algorithm currently uses the base struct */
+                       v += sizeof(struct crush_work_bucket);
+                       break;
+               }
+               /* start with an empty permutation for this bucket */
+               w->work[b]->perm_x = 0;
+               w->work[b]->perm_n = 0;
+               w->work[b]->perm = v;
+               v += map->buckets[b]->size * sizeof(__u32);
+       }
+       /* we must have consumed exactly what crush_finalize() computed */
+       BUG_ON(v - (void *)w != map->working_size);
+}
+
+
 /**
  * crush_do_rule - calculate a mapping with the given input and rule
  * @map: the crush_map
@@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map,
  * @result_max: maximum result size
  * @weight: weight vector (for map leaves)
  * @weight_max: size of weight vector
- * @scratch: scratch vector for private use; must be >= 3 * result_max
+ * @cwin: pointer to at least crush_work_size() bytes of memory
  */
 int crush_do_rule(const struct crush_map *map,
                  int ruleno, int x, int *result, int result_max,
                  const __u32 *weight, int weight_max,
-                 int *scratch)
+                 void *cwin)
 {
        int result_len;
-       int *a = scratch;
-       int *b = scratch + result_max;
-       int *c = scratch + result_max*2;
+       struct crush_work *cw = cwin;
+       int *a = cwin + map->working_size;
+       int *b = a + result_max;
+       int *c = b + result_max;
+       int *w = a;
+       int *o = b;
        int recurse_to_leaf;
-       int *w;
        int wsize = 0;
-       int *o;
        int osize;
        int *tmp;
-       struct crush_rule *rule;
+       const struct crush_rule *rule;
        __u32 step;
        int i, j;
        int numrep;
@@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map,
 
        rule = map->rules[ruleno];
        result_len = 0;
-       w = a;
-       o = b;
 
        for (step = 0; step < rule->len; step++) {
                int firstn = 0;
-               struct crush_rule_step *curstep = &rule->steps[step];
+               const struct crush_rule_step *curstep = &rule->steps[step];
 
                switch (curstep->op) {
                case CRUSH_RULE_TAKE:
@@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map,
                                                recurse_tries = choose_tries;
                                        osize += crush_choose_firstn(
                                                map,
+                                               cw,
                                                map->buckets[bno],
                                                weight, weight_max,
                                                x, numrep,
@@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map,
                                                    numrep : (result_max-osize));
                                        crush_choose_indep(
                                                map,
+                                               cw,
                                                map->buckets[bno],
                                                weight, weight_max,
                                                x, out_size, numrep,
@@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map,
                        break;
                }
        }
+
        return result_len;
 }
index 292e33bd916e650c0317ab630a0c60a400d21c7d..85747b7f91a91894d4902636d5145dc957184df3 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/err.h>
 #include <linux/scatterlist.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <crypto/aes.h>
 #include <crypto/skcipher.h>
index f3378ba1a82893024b9012c5421099bce87f1824..b65bbf9f45ebb22c8ac51af34c6b1c29ef7ed17c 100644 (file)
@@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req)
 
        kref_init(&req->r_kref);
        init_completion(&req->r_completion);
-       init_completion(&req->r_done_completion);
        RB_CLEAR_NODE(&req->r_node);
        RB_CLEAR_NODE(&req->r_mc_node);
        INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
        BUG_ON(length > previous);
 
        op->extent.length = length;
-       op->indata_len -= previous - length;
+       if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
+               op->indata_len -= previous - length;
 }
 EXPORT_SYMBOL(osd_req_op_extent_update);
 
@@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
        bool need_send = false;
        bool promoted = false;
 
-       WARN_ON(req->r_tid || req->r_got_reply);
+       WARN_ON(req->r_tid);
        dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
@@ -1704,17 +1704,10 @@ promote:
 
 static void account_request(struct ceph_osd_request *req)
 {
-       unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+       WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
+       WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
 
-       if (req->r_flags & CEPH_OSD_FLAG_READ) {
-               WARN_ON(req->r_flags & mask);
-               req->r_flags |= CEPH_OSD_FLAG_ACK;
-       } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-               WARN_ON(!(req->r_flags & mask));
-       else
-               WARN_ON(1);
-
-       WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
+       req->r_flags |= CEPH_OSD_FLAG_ONDISK;
        atomic_inc(&req->r_osdc->num_requests);
 }
 
@@ -1749,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req)
 
 static void __complete_request(struct ceph_osd_request *req)
 {
-       if (req->r_callback)
+       if (req->r_callback) {
+               dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
+                    req->r_tid, req->r_callback, req->r_result);
                req->r_callback(req);
-       else
-               complete_all(&req->r_completion);
+       }
 }
 
 /*
- * Note that this is open-coded in handle_reply(), which has to deal
- * with ack vs commit, dup acks, etc.
+ * This is open-coded in handle_reply().
  */
 static void complete_request(struct ceph_osd_request *req, int err)
 {
@@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err)
        req->r_result = err;
        finish_request(req);
        __complete_request(req);
-       complete_all(&req->r_done_completion);
+       complete_all(&req->r_completion);
        ceph_osdc_put_request(req);
 }
 
@@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req)
 
        cancel_map_check(req);
        finish_request(req);
-       complete_all(&req->r_done_completion);
+       complete_all(&req->r_completion);
        ceph_osdc_put_request(req);
 }
 
@@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req)
        mutex_lock(&lreq->lock);
        dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
             lreq->linger_id, req->r_result);
-       WARN_ON(!__linger_registered(lreq));
        linger_reg_commit_complete(lreq, req->r_result);
        lreq->committed = true;
 
@@ -2785,31 +2777,8 @@ e_inval:
 }
 
 /*
- * We are done with @req if
- *   - @m is a safe reply, or
- *   - @m is an unsafe reply and we didn't want a safe one
- */
-static bool done_request(const struct ceph_osd_request *req,
-                        const struct MOSDOpReply *m)
-{
-       return (m->result < 0 ||
-               (m->flags & CEPH_OSD_FLAG_ONDISK) ||
-               !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
-}
-
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- *
- * ->r_unsafe_callback is set? yes                     no
- *
- * first reply is OK (needed   r_cb/r_completion,      r_cb/r_completion,
- * any or needed/got safe)     r_done_completion       r_done_completion
- *
- * first reply is unsafe       r_unsafe_cb(true)       (nothing)
- *
- * when we get the safe reply  r_unsafe_cb(false),     r_cb/r_completion,
- *                             r_done_completion       r_done_completion
+ * Handle MOSDOpReply.  Set ->r_result and call the callback if it is
+ * specified.
  */
 static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 {
@@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
        struct MOSDOpReply m;
        u64 tid = le64_to_cpu(msg->hdr.tid);
        u32 data_len = 0;
-       bool already_acked;
        int ret;
        int i;
 
@@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
                       le32_to_cpu(msg->hdr.data_len), req->r_tid);
                goto fail_request;
        }
-       dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
-            req, req->r_tid, req->r_got_reply, m.result, data_len);
-
-       already_acked = req->r_got_reply;
-       if (!already_acked) {
-               req->r_result = m.result ?: data_len;
-               req->r_replay_version = m.replay_version; /* struct */
-               req->r_got_reply = true;
-       } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
-               dout("req %p tid %llu dup ack\n", req, req->r_tid);
-               goto out_unlock_session;
-       }
-
-       if (done_request(req, &m)) {
-               finish_request(req);
-               if (req->r_linger) {
-                       WARN_ON(req->r_unsafe_callback);
-                       dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
-                       __complete_request(req);
-               }
-       }
+       dout("%s req %p tid %llu result %d data_len %u\n", __func__,
+            req, req->r_tid, m.result, data_len);
 
+       /*
+        * Since we only ever request ONDISK, we should only ever get
+        * one (type of) reply back.
+        */
+       WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+       req->r_result = m.result ?: data_len;
+       finish_request(req);
        mutex_unlock(&osd->lock);
        up_read(&osdc->lock);
 
-       if (done_request(req, &m)) {
-               if (already_acked && req->r_unsafe_callback) {
-                       dout("req %p tid %llu safe-cb\n", req, req->r_tid);
-                       req->r_unsafe_callback(req, false);
-               } else if (!req->r_linger) {
-                       dout("req %p tid %llu cb\n", req, req->r_tid);
-                       __complete_request(req);
-               }
-               complete_all(&req->r_done_completion);
-               ceph_osdc_put_request(req);
-       } else {
-               if (req->r_unsafe_callback) {
-                       dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
-                       req->r_unsafe_callback(req, true);
-               } else {
-                       WARN_ON(1);
-               }
-       }
-
+       __complete_request(req);
+       complete_all(&req->r_completion);
+       ceph_osdc_put_request(req);
        return;
 
 fail_request:
@@ -3540,7 +3480,7 @@ again:
                        up_read(&osdc->lock);
                        dout("%s waiting on req %p tid %llu last_tid %llu\n",
                             __func__, req, req->r_tid, last_tid);
-                       wait_for_completion(&req->r_done_completion);
+                       wait_for_completion(&req->r_completion);
                        ceph_osdc_put_request(req);
                        goto again;
                }
@@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
 
        ceph_oid_copy(&lreq->t.base_oid, oid);
        ceph_oloc_copy(&lreq->t.base_oloc, oloc);
-       lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       lreq->t.flags = CEPH_OSD_FLAG_WRITE;
        lreq->mtime = CURRENT_TIME;
 
        lreq->reg_req = alloc_linger_request(lreq);
@@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
 
        ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
        ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
-       req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+       req->r_flags = CEPH_OSD_FLAG_WRITE;
        req->r_mtime = CURRENT_TIME;
        osd_req_op_watch_init(req, 0, lreq->linger_id,
                              CEPH_OSD_WATCH_OP_UNWATCH);
@@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
  * Execute an OSD class method on an object.
  *
  * @flags: CEPH_OSD_FLAG_*
- * @resp_len: out param for reply length
+ * @resp_len: in/out param for reply length
  */
 int ceph_osdc_call(struct ceph_osd_client *osdc,
                   struct ceph_object_id *oid,
@@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
        struct ceph_osd_request *req;
        int ret;
 
+       if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+               return -E2BIG;
+
        req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
        if (!req)
                return -ENOMEM;
@@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
                                                  0, false, false);
        if (resp_page)
                osd_req_op_cls_response_data_pages(req, 0, &resp_page,
-                                                  PAGE_SIZE, 0, false, false);
+                                                  *resp_len, 0, false, false);
 
        ceph_osdc_start_request(osdc, req, false);
        ret = ceph_osdc_wait_request(osdc, req);
@@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
        int page_align = off & ~PAGE_MASK;
 
        req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
-                                   CEPH_OSD_OP_WRITE,
-                                   CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+                                   CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
                                    snapc, truncate_seq, truncate_size,
                                    true);
        if (IS_ERR(req))
index d2436880b3056da8342845103c23aa59c66f8066..6824c0ec8373e721ac9ca2d837f488ff22233e1f 100644 (file)
@@ -153,6 +153,32 @@ bad:
         return -EINVAL;
 }
 
+/*
+ * Work out c->working_size, the number of bytes of workspace needed
+ * for this map: the crush_work header, one crush_work_bucket pointer
+ * per bucket slot and, for each bucket that is present, its
+ * crush_work_bucket plus a permutation array of bucket->size entries.
+ */
+static void crush_finalize(struct crush_map *c)
+{
+       __s32 i;
+
+       /* Header and the array of pointers to per-bucket workspace */
+       c->working_size = sizeof(struct crush_work) +
+           c->max_buckets * sizeof(struct crush_work_bucket *);
+
+       for (i = 0; i < c->max_buckets; i++) {
+               if (c->buckets[i]) {
+                       switch (c->buckets[i]->alg) {
+                       default:
+                               /*
+                                * Base case: the permutation state
+                                * and the pointer to the permutation
+                                * array.
+                                */
+                               c->working_size +=
+                                   sizeof(struct crush_work_bucket);
+                               break;
+                       }
+                       /* The permutation array itself, one per bucket. */
+                       c->working_size +=
+                           c->buckets[i]->size * sizeof(__u32);
+               }
+       }
+}
+
 static struct crush_map *crush_decode(void *pbyval, void *end)
 {
        struct crush_map *c;
@@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
                b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
                if (b->items == NULL)
                        goto badmem;
-               b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
-               if (b->perm == NULL)
-                       goto badmem;
-               b->perm_n = 0;
 
                ceph_decode_need(p, end, b->size*sizeof(u32), bad);
                for (j = 0; j < b->size; j++)
@@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
        dout("crush decode tunable chooseleaf_stable = %d\n",
             c->chooseleaf_stable);
 
+       crush_finalize(c);
+
 done:
        dout("crush_decode success\n");
        return c;
@@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
        map->pool_max = -1;
        map->pg_temp = RB_ROOT;
        map->primary_temp = RB_ROOT;
-       mutex_init(&map->crush_scratch_mutex);
+       mutex_init(&map->crush_workspace_mutex);
 
        return map;
 }
@@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
        kfree(map->osd_weight);
        kfree(map->osd_addr);
        kfree(map->osd_primary_affinity);
+       kfree(map->crush_workspace);
        kfree(map);
 }
 
@@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
        return 0;
 }
 
+/*
+ * Make @crush the CRUSH map of @map and pair it with a freshly
+ * initialized workspace.
+ *
+ * @crush may be an ERR_PTR (callers hand in the crush_decode() result
+ * directly); the error is then passed back and @map is left alone.
+ * Otherwise @crush is consumed: it is either installed in @map, with
+ * the previous map and workspace freed, or destroyed if the workspace
+ * cannot be allocated.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
+{
+       size_t work_size;
+       void *workspace;
+
+       if (IS_ERR(crush))
+               return PTR_ERR(crush);
+
+       /* do_crush() never asks for more than CEPH_PG_MAX_SIZE results */
+       work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
+       dout("%s work_size %zu bytes\n", __func__, work_size);
+
+       workspace = kmalloc(work_size, GFP_NOIO);
+       if (workspace == NULL)
+               goto err_nomem;
+       crush_init_workspace(crush, workspace);
+
+       /* release the old map/workspace pair, then install the new one */
+       kfree(map->crush_workspace);
+       if (map->crush != NULL)
+               crush_destroy(map->crush);
+       map->crush_workspace = workspace;
+       map->crush = crush;
+       return 0;
+
+err_nomem:
+       crush_destroy(crush);
+       return -ENOMEM;
+}
+
 #define OSDMAP_WRAPPER_COMPAT_VER      7
 #define OSDMAP_CLIENT_DATA_COMPAT_VER  1
 
@@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 
        /* crush */
        ceph_decode_32_safe(p, end, len, e_inval);
-       map->crush = crush_decode(*p, min(*p + len, end));
-       if (IS_ERR(map->crush)) {
-               err = PTR_ERR(map->crush);
-               map->crush = NULL;
+       err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
+       if (err)
                goto bad;
-       }
-       *p += len;
 
        /* ignore the rest */
        *p = end;
@@ -1375,7 +1421,6 @@ e_inval:
 struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                                             struct ceph_osdmap *map)
 {
-       struct crush_map *newcrush = NULL;
        struct ceph_fsid fsid;
        u32 epoch = 0;
        struct ceph_timespec modified;
@@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        /* new crush? */
        ceph_decode_32_safe(p, end, len, e_inval);
        if (len > 0) {
-               newcrush = crush_decode(*p, min(*p+len, end));
-               if (IS_ERR(newcrush)) {
-                       err = PTR_ERR(newcrush);
-                       newcrush = NULL;
+               err = osdmap_set_crush(map,
+                                      crush_decode(*p, min(*p + len, end)));
+               if (err)
                        goto bad;
-               }
                *p += len;
        }
 
@@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
        map->epoch++;
        map->modified = modified;
-       if (newcrush) {
-               if (map->crush)
-                       crush_destroy(map->crush);
-               map->crush = newcrush;
-               newcrush = NULL;
-       }
 
        /* new_pools */
        err = decode_new_pools(p, end, map);
@@ -1505,8 +1542,6 @@ bad:
        print_hex_dump(KERN_DEBUG, "osdmap: ",
                       DUMP_PREFIX_OFFSET, 16, 1,
                       start, end - start, true);
-       if (newcrush)
-               crush_destroy(newcrush);
        return ERR_PTR(err);
 }
 
@@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
        BUG_ON(result_max > CEPH_PG_MAX_SIZE);
 
-       mutex_lock(&map->crush_scratch_mutex);
+       mutex_lock(&map->crush_workspace_mutex);
        r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-                         weight, weight_max, map->crush_scratch_ary);
-       mutex_unlock(&map->crush_scratch_mutex);
+                         weight, weight_max, map->crush_workspace);
+       mutex_unlock(&map->crush_workspace_mutex);
 
        return r;
 }
@@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
                return;
        }
 
-       len = do_crush(osdmap, ruleno, pps, raw->osds,
-                      min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
+       if (pi->size > ARRAY_SIZE(raw->osds)) {
+               pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
+                      pi->id, pi->crush_ruleset, pi->type, pi->size,
+                      ARRAY_SIZE(raw->osds));
+               return;
+       }
+
+       len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
                       osdmap->osd_weight, osdmap->max_osd);
        if (len < 0) {
                pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
index 154683f5f14cdc6e99fa01a4e79e22e69924eef4..705414e78ae0b05d2d1b8d5d8f8e8fbb6007bfb4 100644 (file)
@@ -18,8 +18,6 @@
  * 02110-1301, USA.
  */
 
-#include <stddef.h>
-
 #include <linux/types.h>
 #include <linux/export.h>
 #include <linux/ceph/libceph.h>