#include "ceph_debug.h"

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/writeback.h>

#include "super.h"
#include "decode.h"
#include "messenger.h"

/*
 * Capability management
 *
 * The Ceph metadata servers control client access to inode metadata
 * and file data by issuing capabilities, granting clients permission
 * to read and/or write both inode fields and file data to OSDs
 * (storage nodes).  Each capability consists of a set of bits
 * indicating which operations are allowed.
 *
 * If the client holds a *_SHARED cap, the client has a coherent value
 * that can be safely read from the cached inode.
 *
 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
 * client is allowed to change inode attributes (e.g., file size,
 * mtime), note its dirty state in the ceph_cap, and asynchronously
 * flush that metadata change to the MDS.
 *
 * In the event of a conflicting operation (perhaps by another
 * client), the MDS will revoke the conflicting client capabilities.
 *
 * In order for a client to cache an inode, it must hold a capability
 * with at least one MDS server.  When inodes are released, release
 * notifications are batched and periodically sent en masse to the MDS
 * cluster to release server state.
 */

/*
 * Generate readable cap strings for debugging output.
 */
#define MAX_CAP_STR 20
static char cap_str[MAX_CAP_STR][40];
static DEFINE_SPINLOCK(cap_str_lock);
static int last_cap_str;

static char *gcap_string(char *s, int c)
{
	if (c & CEPH_CAP_GSHARED)
		*s++ = 's';
	if (c & CEPH_CAP_GEXCL)
		*s++ = 'x';
	if (c & CEPH_CAP_GCACHE)
		*s++ = 'c';
	if (c & CEPH_CAP_GRD)
		*s++ = 'r';
	if (c & CEPH_CAP_GWR)
		*s++ = 'w';
	if (c & CEPH_CAP_GBUFFER)
		*s++ = 'b';
	if (c & CEPH_CAP_GLAZYIO)
		*s++ = 'l';
	return s;
}

const char *ceph_cap_string(int caps)
{
	int i, c;
	char *s;

	spin_lock(&cap_str_lock);
	i = last_cap_str++;
	if (last_cap_str == MAX_CAP_STR)
		last_cap_str = 0;
	spin_unlock(&cap_str_lock);

	s = cap_str[i];

	if (caps & CEPH_CAP_PIN)
		*s++ = 'p';

	c = (caps >> CEPH_CAP_SAUTH) & 3;
	if (c) {
		*s++ = 'A';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SLINK) & 3;
	if (c) {
		*s++ = 'L';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SXATTR) & 3;
	if (c) {
		*s++ = 'X';
		s = gcap_string(s, c);
	}

	c = caps >> CEPH_CAP_SFILE;
	if (c) {
		*s++ = 'F';
		s = gcap_string(s, c);
	}

	if (s == cap_str[i])
		*s++ = '-';
	*s = 0;
	return cap_str[i];
}
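
/*
 * For example, ceph_cap_string(CEPH_CAP_PIN | CEPH_CAP_AUTH_SHARED |
 * CEPH_CAP_FILE_RD) yields a string of the form "pAsFr": a 'p' for the
 * pin, then one letter per cap group ('A'uth, 'L'ink, 'X'attr, 'F'ile)
 * followed by the granted bits within that group (s, x, c, r, w, b, l).
 */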

/*
 * Maintain a global pool of preallocated struct ceph_caps, referenced
 * by struct ceph_caps_reservations.  This ensures that we preallocate
 * memory needed to successfully process an MDS response.  (If an MDS
 * sends us cap information and we fail to process it, we will have
 * problems due to the client and MDS being out of sync.)
 *
 * Reservations are 'owned' by a ceph_cap_reservation context.
 */
static spinlock_t caps_list_lock;
static struct list_head caps_list;  /* unused (reserved or unreserved) */
static int caps_total_count;        /* total caps allocated */
static int caps_use_count;          /* in use */
static int caps_reserve_count;      /* unused, reserved */
static int caps_avail_count;        /* unused, unreserved */
static int caps_min_count;          /* keep at least this many (unreserved) */
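
/*
 * The pool maintains the invariant (asserted by the BUG_ONs below)
 *
 *	caps_total_count == caps_use_count + caps_reserve_count +
 *			    caps_avail_count
 *
 * A typical caller (a sketch; the actual call sites live in the MDS
 * client code) reserves before processing a reply and returns any
 * unused reservation afterward:
 *
 *	struct ceph_cap_reservation ctx = { 0 };
 *	ceph_reserve_caps(&ctx, need);
 *	...
 *	cap = get_cap(&ctx);	\/\* cannot fail for a reserved ctx \*\/
 *	...
 *	ceph_unreserve_caps(&ctx);
 */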

void __init ceph_caps_init(void)
{
	INIT_LIST_HEAD(&caps_list);
	spin_lock_init(&caps_list_lock);
}

void ceph_caps_finalize(void)
{
	struct ceph_cap *cap;

	spin_lock(&caps_list_lock);
	while (!list_empty(&caps_list)) {
		cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
		list_del(&cap->caps_item);
		kmem_cache_free(ceph_cap_cachep, cap);
	}
	caps_total_count = 0;
	caps_avail_count = 0;
	caps_use_count = 0;
	caps_reserve_count = 0;
	caps_min_count = 0;
	spin_unlock(&caps_list_lock);
}

void ceph_adjust_min_caps(int delta)
{
	spin_lock(&caps_list_lock);
	caps_min_count += delta;
	BUG_ON(caps_min_count < 0);
	spin_unlock(&caps_list_lock);
}

int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
{
	int i;
	struct ceph_cap *cap;
	int have;
	int alloc = 0;
	LIST_HEAD(newcaps);
	int ret = 0;

	dout("reserve caps ctx=%p need=%d\n", ctx, need);

	/* first reserve any caps that are already allocated */
	spin_lock(&caps_list_lock);
	if (caps_avail_count >= need)
		have = need;
	else
		have = caps_avail_count;
	caps_avail_count -= have;
	caps_reserve_count += have;
	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
	       caps_avail_count);
	spin_unlock(&caps_list_lock);

	for (i = have; i < need; i++) {
		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
		if (!cap) {
			ret = -ENOMEM;
			goto out_alloc_count;
		}
		list_add(&cap->caps_item, &newcaps);
		alloc++;
	}
	BUG_ON(have + alloc != need);

	spin_lock(&caps_list_lock);
	caps_total_count += alloc;
	caps_reserve_count += alloc;
	list_splice(&newcaps, &caps_list);

	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
	       caps_avail_count);
	spin_unlock(&caps_list_lock);

	ctx->count = need;
	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
	     ctx, caps_total_count, caps_use_count, caps_reserve_count,
	     caps_avail_count);
	return 0;

out_alloc_count:
	/* we didn't manage to reserve as much as we needed */
	pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
		   ctx, need, have);
	return ret;
}

int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
{
	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
	if (ctx->count) {
		spin_lock(&caps_list_lock);
		BUG_ON(caps_reserve_count < ctx->count);
		caps_reserve_count -= ctx->count;
		caps_avail_count += ctx->count;
		ctx->count = 0;
		dout("unreserve caps %d = %d used + %d resv + %d avail\n",
		     caps_total_count, caps_use_count, caps_reserve_count,
		     caps_avail_count);
		BUG_ON(caps_total_count != caps_use_count +
		       caps_reserve_count + caps_avail_count);
		spin_unlock(&caps_list_lock);
	}
	return 0;
}

static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
{
	struct ceph_cap *cap = NULL;

	/* temporary, until we do something about cap import/export */
	if (!ctx)
		return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);

	spin_lock(&caps_list_lock);
	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
	     ctx, ctx->count, caps_total_count, caps_use_count,
	     caps_reserve_count, caps_avail_count);
	BUG_ON(!ctx->count);
	BUG_ON(ctx->count > caps_reserve_count);
	BUG_ON(list_empty(&caps_list));

	ctx->count--;
	caps_reserve_count--;
	caps_use_count++;

	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
	list_del(&cap->caps_item);

	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
	       caps_avail_count);
	spin_unlock(&caps_list_lock);
	return cap;
}

void ceph_put_cap(struct ceph_cap *cap)
{
	spin_lock(&caps_list_lock);
	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
	     cap, caps_total_count, caps_use_count,
	     caps_reserve_count, caps_avail_count);
	caps_use_count--;
	/*
	 * Keep some preallocated caps around (ceph_min_count), to
	 * avoid lots of free/alloc churn.
	 */
	if (caps_avail_count >= caps_reserve_count + caps_min_count) {
		caps_total_count--;
		kmem_cache_free(ceph_cap_cachep, cap);
	} else {
		caps_avail_count++;
		list_add(&cap->caps_item, &caps_list);
	}

	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
	       caps_avail_count);
	spin_unlock(&caps_list_lock);
}

void ceph_reservation_status(struct ceph_client *client,
			     int *total, int *avail, int *used, int *reserved,
			     int *min)
{
	if (total)
		*total = caps_total_count;
	if (avail)
		*avail = caps_avail_count;
	if (used)
		*used = caps_use_count;
	if (reserved)
		*reserved = caps_reserve_count;
	if (min)
		*min = caps_min_count;
}

/*
 * Find ceph_cap for given mds, if any.
 *
 * Called with i_lock held.
 */
static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
{
	struct ceph_cap *cap;
	struct rb_node *n = ci->i_caps.rb_node;

	while (n) {
		cap = rb_entry(n, struct ceph_cap, ci_node);
		if (mds < cap->mds)
			n = n->rb_left;
		else if (mds > cap->mds)
			n = n->rb_right;
		else
			return cap;
	}
	return NULL;
}

/*
 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
 * -1.
 */
static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
{
	struct ceph_cap *cap;
	int mds = -1;
	struct rb_node *p;

	/* prefer mds with WR|WRBUFFER|EXCL caps */
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		mds = cap->mds;
		if (mseq)
			*mseq = cap->mseq;
		if (cap->issued & (CEPH_CAP_FILE_WR |
				   CEPH_CAP_FILE_BUFFER |
				   CEPH_CAP_FILE_EXCL))
			break;
	}
	return mds;
}

int ceph_get_cap_mds(struct inode *inode)
{
	int mds;

	spin_lock(&inode->i_lock);
	mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
	spin_unlock(&inode->i_lock);
	return mds;
}

/*
 * Called under i_lock.
 */
static void __insert_cap_node(struct ceph_inode_info *ci,
			      struct ceph_cap *new)
{
	struct rb_node **p = &ci->i_caps.rb_node;
	struct rb_node *parent = NULL;
	struct ceph_cap *cap = NULL;

	while (*p) {
		parent = *p;
		cap = rb_entry(parent, struct ceph_cap, ci_node);
		if (new->mds < cap->mds)
			p = &(*p)->rb_left;
		else if (new->mds > cap->mds)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&new->ci_node, parent, p);
	rb_insert_color(&new->ci_node, &ci->i_caps);
}
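
/*
 * Note that ci->i_caps is an rbtree keyed by mds id: an inode holds at
 * most one cap per MDS, so inserting a duplicate mds here is a bug.
 */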

/*
 * (re)set cap hold timeouts, which control the delayed release
 * of unused caps back to the MDS.  Should be called on cap use.
 */
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
			       struct ceph_inode_info *ci)
{
	struct ceph_mount_args *ma = mdsc->client->mount_args;

	ci->i_hold_caps_min = round_jiffies(jiffies +
					    ma->caps_wanted_delay_min * HZ);
	ci->i_hold_caps_max = round_jiffies(jiffies +
					    ma->caps_wanted_delay_max * HZ);
	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
}

/*
 * (Re)queue cap at the end of the delayed cap release list.
 *
 * If I_FLUSH is set, leave the inode at the front of the list.
 *
 * Caller holds i_lock
 *    -> we take mdsc->cap_delay_lock
 */
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
				struct ceph_inode_info *ci)
{
	__cap_set_timeouts(mdsc, ci);
	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
	     ci->i_ceph_flags, ci->i_hold_caps_max);
	if (!mdsc->stopping) {
		spin_lock(&mdsc->cap_delay_lock);
		if (!list_empty(&ci->i_cap_delay_list)) {
			if (ci->i_ceph_flags & CEPH_I_FLUSH)
				goto no_change;
			list_del_init(&ci->i_cap_delay_list);
		}
		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
		spin_unlock(&mdsc->cap_delay_lock);
	}
}

/*
 * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
 * indicating we should send a cap message to flush dirty metadata
 * asap, and move to the front of the delayed cap list.
 */
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				      struct ceph_inode_info *ci)
{
	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
	spin_lock(&mdsc->cap_delay_lock);
	ci->i_ceph_flags |= CEPH_I_FLUSH;
	if (!list_empty(&ci->i_cap_delay_list))
		list_del_init(&ci->i_cap_delay_list);
	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
	spin_unlock(&mdsc->cap_delay_lock);
}

/*
 * Cancel delayed work on cap.
 *
 * Caller must hold i_lock.
 */
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
			       struct ceph_inode_info *ci)
{
	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
	if (list_empty(&ci->i_cap_delay_list))
		return;
	spin_lock(&mdsc->cap_delay_lock);
	list_del_init(&ci->i_cap_delay_list);
	spin_unlock(&mdsc->cap_delay_lock);
}

/*
 * Common issue checks for add_cap, handle_cap_grant.
 */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
			      unsigned issued)
{
	unsigned had = __ceph_caps_issued(ci, NULL);

	/*
	 * Each time we receive FILE_CACHE anew, we increment
	 * i_rdcache_gen.
	 */
	if ((issued & CEPH_CAP_FILE_CACHE) &&
	    (had & CEPH_CAP_FILE_CACHE) == 0)
		ci->i_rdcache_gen++;

	/*
	 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
	 * don't know what happened to this directory while we didn't
	 * have the cap.
	 */
	if ((issued & CEPH_CAP_FILE_SHARED) &&
	    (had & CEPH_CAP_FILE_SHARED) == 0) {
		ci->i_shared_gen++;
		if (S_ISDIR(ci->vfs_inode.i_mode)) {
			dout(" marking %p NOT complete\n", &ci->vfs_inode);
			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
		}
	}
}

/*
 * Add a capability under the given MDS session.
 *
 * Caller should hold session snap_rwsem (read) and s_mutex.
 *
 * @fmode is the open file mode, if we are opening a file, otherwise
 * it is < 0.  (This is so we can atomically add the cap and add an
 * open file reference to it.)
 */
int ceph_add_cap(struct inode *inode,
		 struct ceph_mds_session *session, u64 cap_id,
		 int fmode, unsigned issued, unsigned wanted,
		 unsigned seq, unsigned mseq, u64 realmino, int flags,
		 struct ceph_cap_reservation *caps_reservation)
{
	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *new_cap = NULL;
	struct ceph_cap *cap;
	int mds = session->s_mds;
	int actual_wanted;

	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
	     session->s_mds, cap_id, ceph_cap_string(issued), seq);

	/*
	 * If we are opening the file, include file mode wanted bits
	 * in wanted.
	 */
	if (fmode >= 0)
		wanted |= ceph_caps_for_mode(fmode);

retry:
	spin_lock(&inode->i_lock);
	cap = __get_cap_for_mds(ci, mds);
	if (!cap) {
		if (new_cap) {
			cap = new_cap;
			new_cap = NULL;
		} else {
			spin_unlock(&inode->i_lock);
			new_cap = get_cap(caps_reservation);
			if (new_cap == NULL)
				return -ENOMEM;
			goto retry;
		}

		cap->issued = 0;
		cap->implemented = 0;
		cap->mds = mds;
		cap->mds_wanted = 0;

		cap->ci = ci;
		__insert_cap_node(ci, cap);

		/* clear out old exporting info?  (i.e. on cap import) */
		if (ci->i_cap_exporting_mds == mds) {
			ci->i_cap_exporting_issued = 0;
			ci->i_cap_exporting_mseq = 0;
			ci->i_cap_exporting_mds = -1;
		}

		/* add to session cap list */
		cap->session = session;
		spin_lock(&session->s_cap_lock);
		list_add_tail(&cap->session_caps, &session->s_caps);
		session->s_nr_caps++;
		spin_unlock(&session->s_cap_lock);
	}

	if (!ci->i_snap_realm) {
		/*
		 * add this inode to the appropriate snap realm
		 */
		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
							       realmino);
		if (realm) {
			ceph_get_snap_realm(mdsc, realm);
			spin_lock(&realm->inodes_with_caps_lock);
			ci->i_snap_realm = realm;
			list_add(&ci->i_snap_realm_item,
				 &realm->inodes_with_caps);
			spin_unlock(&realm->inodes_with_caps_lock);
		} else {
			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
			       realmino);
		}
	}

	__check_cap_issue(ci, cap, issued);

	/*
	 * If we are issued caps we don't want, or the mds' wanted
	 * value appears to be off, queue a check so we'll release
	 * later and/or update the mds wanted value.
	 */
	actual_wanted = __ceph_caps_wanted(ci);
	if ((wanted & ~actual_wanted) ||
	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
		     ceph_cap_string(issued), ceph_cap_string(wanted),
		     ceph_cap_string(actual_wanted));
		__cap_delay_requeue(mdsc, ci);
	}

	if (flags & CEPH_CAP_FLAG_AUTH)
		ci->i_auth_cap = cap;
	else if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
	     ceph_cap_string(issued|cap->issued), seq, mds);
	cap->cap_id = cap_id;
	cap->issued = issued;
	cap->implemented |= issued;
	cap->mds_wanted |= wanted;
	cap->seq = seq;
	cap->issue_seq = seq;
	cap->mseq = mseq;
	cap->cap_gen = session->s_cap_gen;

	if (fmode >= 0)
		__ceph_get_fmode(ci, fmode);
	spin_unlock(&inode->i_lock);
	wake_up(&ci->i_cap_wq);
	return 0;
}

/*
 * Return true if cap has not timed out and belongs to the current
 * generation of the MDS session (i.e. has not gone 'stale' due to
 * us losing touch with the mds).
 */
static int __cap_is_valid(struct ceph_cap *cap)
{
	unsigned long ttl;
	u32 gen;

	spin_lock(&cap->session->s_cap_lock);
	gen = cap->session->s_cap_gen;
	ttl = cap->session->s_cap_ttl;
	spin_unlock(&cap->session->s_cap_lock);

	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
		dout("__cap_is_valid %p cap %p issued %s "
		     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
		     cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
		return 0;
	}

	return 1;
}

/*
 * Return set of valid cap bits issued to us.  Note that caps time
 * out, and may be invalidated in bulk if the client session times out
 * and session->s_cap_gen is bumped.
 */
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
	int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
	struct ceph_cap *cap;
	struct rb_node *p;

	if (implemented)
		*implemented = 0;
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		dout("__ceph_caps_issued %p cap %p issued %s\n",
		     &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
		have |= cap->issued;
		if (implemented)
			*implemented |= cap->implemented;
	}
	return have;
}

/*
 * Get cap bits issued by caps other than @ocap
 */
int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
{
	int have = ci->i_snap_caps;
	struct ceph_cap *cap;
	struct rb_node *p;

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (cap == ocap)
			continue;
		if (!__cap_is_valid(cap))
			continue;
		have |= cap->issued;
	}
	return have;
}

/*
 * Move a cap to the end of the LRU (oldest caps at list head, newest
 * at list tail).
 */
static void __touch_cap(struct ceph_cap *cap)
{
	struct ceph_mds_session *s = cap->session;

	spin_lock(&s->s_cap_lock);
	if (s->s_cap_iterator == NULL) {
		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
		     s->s_mds);
		list_move_tail(&cap->session_caps, &s->s_caps);
	} else {
		dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
		     &cap->ci->vfs_inode, cap, s->s_mds);
	}
	spin_unlock(&s->s_cap_lock);
}

/*
 * Check if we hold the given mask.  If so, move the cap(s) to the
 * front of their respective LRUs.  (This is the preferred way for
 * callers to check for caps they want.)
 */
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
	struct ceph_cap *cap;
	struct rb_node *p;
	int have = ci->i_snap_caps;

	if ((have & mask) == mask) {
		dout("__ceph_caps_issued_mask %p snap issued %s"
		     " (mask %s)\n", &ci->vfs_inode,
		     ceph_cap_string(have),
		     ceph_cap_string(mask));
		return 1;
	}

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		if ((cap->issued & mask) == mask) {
			dout("__ceph_caps_issued_mask %p cap %p issued %s"
			     " (mask %s)\n", &ci->vfs_inode, cap,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch)
				__touch_cap(cap);
			return 1;
		}

		/* does a combination of caps satisfy mask? */
		have |= cap->issued;
		if ((have & mask) == mask) {
			dout("__ceph_caps_issued_mask %p combo issued %s"
			     " (mask %s)\n", &ci->vfs_inode,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch) {
				struct rb_node *q;

				/* touch this + preceding caps */
				__touch_cap(cap);
				for (q = rb_first(&ci->i_caps); q != p;
				     q = rb_next(q)) {
					cap = rb_entry(q, struct ceph_cap,
						       ci_node);
					if (!__cap_is_valid(cap))
						continue;
					__touch_cap(cap);
				}
			}
			return 1;
		}
	}

	return 0;
}
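
/*
 * For instance, a getattr path that only needs coherent auth metadata
 * might check __ceph_caps_issued_mask(ci, CEPH_CAP_AUTH_SHARED, 1) and,
 * on success, trust the cached uid/gid/mode without an MDS round trip.
 * (Illustrative; see the actual callers for the real masks used.)
 */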

/*
 * Return true if mask caps are currently being revoked by an MDS.
 */
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
	struct inode *inode = &ci->vfs_inode;
	struct ceph_cap *cap;
	struct rb_node *p;
	int ret = 0;

	spin_lock(&inode->i_lock);
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (__cap_is_valid(cap) &&
		    (cap->implemented & ~cap->issued & mask)) {
			ret = 1;
			break;
		}
	}
	spin_unlock(&inode->i_lock);
	dout("ceph_caps_revoking %p %s = %d\n", inode,
	     ceph_cap_string(mask), ret);
	return ret;
}

int __ceph_caps_used(struct ceph_inode_info *ci)
{
	int used = 0;
	if (ci->i_pin_ref)
		used |= CEPH_CAP_PIN;
	if (ci->i_rd_ref)
		used |= CEPH_CAP_FILE_RD;
	if (ci->i_rdcache_ref || ci->i_rdcache_gen)
		used |= CEPH_CAP_FILE_CACHE;
	if (ci->i_wr_ref)
		used |= CEPH_CAP_FILE_WR;
	if (ci->i_wrbuffer_ref)
		used |= CEPH_CAP_FILE_BUFFER;
	return used;
}

/*
 * wanted, by virtue of open file modes
 */
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
	int want = 0;
	int mode;

	for (mode = 0; mode < 4; mode++)
		if (ci->i_nr_by_mode[mode])
			want |= ceph_caps_for_mode(mode);
	return want;
}
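
/*
 * For example, an inode with one CEPH_FILE_MODE_RD open and one
 * CEPH_FILE_MODE_RDWR open wants the union of the caps for both modes,
 * as translated by ceph_caps_for_mode().
 */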

/*
 * Return caps we have registered with the MDS(s) as 'wanted'.
 */
int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
{
	struct ceph_cap *cap;
	struct rb_node *p;
	int mds_wanted = 0;

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		mds_wanted |= cap->mds_wanted;
	}
	return mds_wanted;
}

/*
 * called under i_lock
 */
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
{
	return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
}

/*
 * caller should hold i_lock.
 * caller will not hold session s_mutex if called from destroy_inode.
 */
void __ceph_remove_cap(struct ceph_cap *cap)
{
	struct ceph_mds_session *session = cap->session;
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;

	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

	/* remove from inode list */
	rb_erase(&cap->ci_node, &ci->i_caps);
	cap->ci = NULL;
	if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	/* remove from session list */
	spin_lock(&session->s_cap_lock);
	if (session->s_cap_iterator == cap) {
		/* not yet, we are iterating over this very cap */
		dout("__ceph_remove_cap delaying %p removal from session %p\n",
		     cap, cap->session);
	} else {
		list_del_init(&cap->session_caps);
		session->s_nr_caps--;
		cap->session = NULL;
	}
	spin_unlock(&session->s_cap_lock);

	if (cap->session == NULL)
		ceph_put_cap(cap);

	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
		struct ceph_snap_realm *realm = ci->i_snap_realm;
		spin_lock(&realm->inodes_with_caps_lock);
		list_del_init(&ci->i_snap_realm_item);
		ci->i_snap_realm_counter++;
		ci->i_snap_realm = NULL;
		spin_unlock(&realm->inodes_with_caps_lock);
		ceph_put_snap_realm(mdsc, realm);
	}
	if (!__ceph_is_any_real_caps(ci))
		__cap_delay_cancel(mdsc, ci);
}

/*
 * Build and send a cap message to the given MDS.
 *
 * Caller should be holding s_mutex.
 */
static int send_cap_msg(struct ceph_mds_session *session,
			u64 ino, u64 cid, int op,
			int caps, int wanted, int dirty,
			u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
			u64 size, u64 max_size,
			struct timespec *mtime, struct timespec *atime,
			u64 time_warp_seq,
			uid_t uid, gid_t gid, mode_t mode,
			u64 xattr_version,
			struct ceph_buffer *xattrs_buf,
			u64 follows)
{
	struct ceph_mds_caps *fc;
	struct ceph_msg *msg;

	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
	     " seq %u/%u mseq %u follows %lld size %llu/%llu"
	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
	     cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
	     ceph_cap_string(dirty),
	     seq, issue_seq, mseq, follows, size, max_size,
	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);

	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
	if (IS_ERR(msg))
		return PTR_ERR(msg);

	msg->hdr.tid = cpu_to_le64(flush_tid);

	fc = msg->front.iov_base;
	memset(fc, 0, sizeof(*fc));

	fc->cap_id = cpu_to_le64(cid);
	fc->op = cpu_to_le32(op);
	fc->seq = cpu_to_le32(seq);
	fc->issue_seq = cpu_to_le32(issue_seq);
	fc->migrate_seq = cpu_to_le32(mseq);
	fc->caps = cpu_to_le32(caps);
	fc->wanted = cpu_to_le32(wanted);
	fc->dirty = cpu_to_le32(dirty);
	fc->ino = cpu_to_le64(ino);
	fc->snap_follows = cpu_to_le64(follows);

	fc->size = cpu_to_le64(size);
	fc->max_size = cpu_to_le64(max_size);
	if (mtime)
		ceph_encode_timespec(&fc->mtime, mtime);
	if (atime)
		ceph_encode_timespec(&fc->atime, atime);
	fc->time_warp_seq = cpu_to_le32(time_warp_seq);

	fc->uid = cpu_to_le32(uid);
	fc->gid = cpu_to_le32(gid);
	fc->mode = cpu_to_le32(mode);

	fc->xattr_version = cpu_to_le64(xattr_version);
	if (xattrs_buf) {
		msg->middle = ceph_buffer_get(xattrs_buf);
		fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
		msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
	}

	ceph_con_send(&session->s_con, msg);
	return 0;
}

/*
 * Queue cap releases when an inode is dropped from our cache.  Since
 * inode is about to be destroyed, there is no need for i_lock.
 */
void ceph_queue_caps_release(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct rb_node *p;

	p = rb_first(&ci->i_caps);
	while (p) {
		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
		struct ceph_mds_session *session = cap->session;
		struct ceph_msg *msg;
		struct ceph_mds_cap_release *head;
		struct ceph_mds_cap_item *item;

		spin_lock(&session->s_cap_lock);
		BUG_ON(!session->s_num_cap_releases);
		msg = list_first_entry(&session->s_cap_releases,
				       struct ceph_msg, list_head);

		dout(" adding %p release to mds%d msg %p (%d left)\n",
		     inode, session->s_mds, msg, session->s_num_cap_releases);

		BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
		head = msg->front.iov_base;
		head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
		item = msg->front.iov_base + msg->front.iov_len;
		item->ino = cpu_to_le64(ceph_ino(inode));
		item->cap_id = cpu_to_le64(cap->cap_id);
		item->migrate_seq = cpu_to_le32(cap->mseq);
		item->seq = cpu_to_le32(cap->issue_seq);

		session->s_num_cap_releases--;

		msg->front.iov_len += sizeof(*item);
		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
			dout(" release msg %p full\n", msg);
			list_move_tail(&msg->list_head,
				       &session->s_cap_releases_done);
		} else {
			dout(" release msg %p at %d/%d (%d)\n", msg,
			     (int)le32_to_cpu(head->num),
			     (int)CEPH_CAPS_PER_RELEASE,
			     (int)msg->front.iov_len);
		}
		spin_unlock(&session->s_cap_lock);
		p = rb_next(p);
		__ceph_remove_cap(cap);
	}
}
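
/*
 * Note that nothing is sent here: full release messages are moved to
 * the session's s_cap_releases_done list and partly filled ones stay
 * on s_cap_releases; the MDS client sends them out in batches later
 * (see the mds client code).
 */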

/*
 * Send a cap msg on the given inode.  Update our caps state, then
 * drop i_lock and send the message.
 *
 * Make note of max_size reported/requested from mds, revoked caps
 * that have now been implemented.
 *
 * Make half-hearted attempt to invalidate page cache if we are
 * dropping RDCACHE.  Note that this will leave behind locked pages
 * that we'll then need to deal with elsewhere.
 *
 * Return non-zero if delayed release, or we experienced an error
 * such that the caller should requeue + retry later.
 *
 * called with i_lock, then drops it.
 * caller should hold snap_rwsem (read), s_mutex.
 */
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
		      int op, int used, int want, int retain, int flushing,
		      unsigned *pflush_tid)
	__releases(cap->ci->vfs_inode->i_lock)
{
	struct ceph_inode_info *ci = cap->ci;
	struct inode *inode = &ci->vfs_inode;
	u64 cap_id = cap->cap_id;
	int held, revoking, dropping, keep;
	u64 seq, issue_seq, mseq, time_warp_seq, follows;
	u64 size, max_size;
	struct timespec mtime, atime;
	int wake = 0;
	mode_t mode;
	uid_t uid;
	gid_t gid;
	struct ceph_mds_session *session;
	u64 xattr_version = 0;
	int delayed = 0;
	u64 flush_tid = 0;
	int i;
	int ret;

	held = cap->issued | cap->implemented;
	revoking = cap->implemented & ~cap->issued;
	retain &= ~revoking;
	dropping = cap->issued & ~retain;

	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
	     inode, cap, cap->session,
	     ceph_cap_string(held), ceph_cap_string(held & retain),
	     ceph_cap_string(revoking));
	BUG_ON((retain & CEPH_CAP_PIN) == 0);

	session = cap->session;

	/* don't release wanted unless we've waited a bit. */
	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
	    time_before(jiffies, ci->i_hold_caps_min)) {
		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
		     ceph_cap_string(cap->issued),
		     ceph_cap_string(cap->issued & retain),
		     ceph_cap_string(cap->mds_wanted),
		     ceph_cap_string(want));
		want |= cap->mds_wanted;
		retain |= cap->issued;
		delayed = 1;
	}
	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);

	cap->issued &= retain;  /* drop bits we don't want */
	if (cap->implemented & ~cap->issued) {
		/*
		 * Wake up any waiters on wanted -> needed transition.
		 * This is due to the weird transition from buffered
		 * to sync IO... we need to flush dirty pages _before_
		 * allowing sync writes to avoid reordering.
		 */
		wake = 1;
	}
	cap->implemented &= cap->issued | used;
	cap->mds_wanted = want;

	if (flushing) {
		/*
		 * assign a tid for flush operations so we can avoid
		 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
		 * clean type races.  track latest tid for every bit
		 * so we can handle flush AxFw, flush Fw, and have the
		 * first ack clean Ax.
		 */
		flush_tid = ++ci->i_cap_flush_last_tid;
		if (pflush_tid)
			*pflush_tid = flush_tid;
		dout(" cap_flush_tid %d\n", (int)flush_tid);
		for (i = 0; i < CEPH_CAP_BITS; i++)
			if (flushing & (1 << i))
				ci->i_cap_flush_tid[i] = flush_tid;
	}

	keep = cap->implemented;
	seq = cap->seq;
	issue_seq = cap->issue_seq;
	mseq = cap->mseq;
	size = inode->i_size;
	ci->i_reported_size = size;
	max_size = ci->i_wanted_max_size;
	ci->i_requested_max_size = max_size;
	mtime = inode->i_mtime;
	atime = inode->i_atime;
	time_warp_seq = ci->i_time_warp_seq;
	follows = ci->i_snap_realm->cached_context->seq;
	uid = inode->i_uid;
	gid = inode->i_gid;
	mode = inode->i_mode;

	if (dropping & CEPH_CAP_XATTR_EXCL) {
		__ceph_build_xattrs_blob(ci);
		xattr_version = ci->i_xattrs.version + 1;
	}

	spin_unlock(&inode->i_lock);

	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
		size, max_size, &mtime, &atime, time_warp_seq,
		uid, gid, mode,
		xattr_version,
		(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
		follows);
	if (ret < 0) {
		dout("error sending cap msg, must requeue %p\n", inode);
		delayed = 1;
	}

	if (wake)
		wake_up(&ci->i_cap_wq);

	return delayed;
}
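
/*
 * Note that __send_cap() returns non-zero only to ask the caller to
 * requeue: either we chose to keep 'wanted' bits a little longer, or
 * sending the message failed.  Either way the cap state itself stays
 * consistent; only the timing of the release changes.
 */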

/*
 * When a snapshot is taken, clients accumulate dirty metadata on
 * inodes with capabilities in ceph_cap_snaps to describe the file
 * state at the time the snapshot was taken.  This must be flushed
 * asynchronously back to the MDS once sync writes complete and dirty
 * data is written out.
 *
 * Called under i_lock.  Takes s_mutex as needed.
 */
void __ceph_flush_snaps(struct ceph_inode_info *ci,
			struct ceph_mds_session **psession)
{
	struct inode *inode = &ci->vfs_inode;
	int mds;
	struct ceph_cap_snap *capsnap;
	u32 mseq;
	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
						    session->s_mutex */
	u64 next_follows = 0;  /* keep track of how far we've gotten through the
			     i_cap_snaps list, and skip these entries next time
			     around to avoid an infinite loop */

	if (psession)
		session = *psession;

	dout("__flush_snaps %p\n", inode);
retry:
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		/* avoid an infinite loop after retry */
		if (capsnap->follows < next_follows)
			continue;
		/*
		 * we need to wait for sync writes to complete and for dirty
		 * pages to be written out.
		 */
		if (capsnap->dirty_pages || capsnap->writing)
			continue;

		/*
		 * if cap writeback already occurred, we should have dropped
		 * the capsnap in ceph_put_wrbuffer_cap_refs.
		 */
		BUG_ON(capsnap->dirty == 0);

		/* pick mds, take s_mutex */
		mds = __ceph_get_cap_mds(ci, &mseq);
		if (session && session->s_mds != mds) {
			dout("oops, wrong session %p mutex\n", session);
			mutex_unlock(&session->s_mutex);
			ceph_put_mds_session(session);
			session = NULL;
		}
		if (!session) {
			spin_unlock(&inode->i_lock);
			mutex_lock(&mdsc->mutex);
			session = __ceph_lookup_mds_session(mdsc, mds);
			mutex_unlock(&mdsc->mutex);
			if (session) {
				dout("inverting session/ino locks on %p\n",
				     session);
				mutex_lock(&session->s_mutex);
			}
			/*
			 * if session == NULL, we raced against a cap
			 * deletion.  retry, and we'll get a better
			 * @mds value next time.
			 */
			spin_lock(&inode->i_lock);
			goto retry;
		}

		capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
		atomic_inc(&capsnap->nref);
		if (!list_empty(&capsnap->flushing_item))
			list_del_init(&capsnap->flushing_item);
		list_add_tail(&capsnap->flushing_item,
			      &session->s_cap_snaps_flushing);
		spin_unlock(&inode->i_lock);

		dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
		     inode, capsnap, next_follows, capsnap->size);
		send_cap_msg(session, ceph_vino(inode).ino, 0,
			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
			     capsnap->size, 0,
			     &capsnap->mtime, &capsnap->atime,
			     capsnap->time_warp_seq,
			     capsnap->uid, capsnap->gid, capsnap->mode,
			     0, NULL,
			     capsnap->follows);

		next_follows = capsnap->follows + 1;
		ceph_put_cap_snap(capsnap);

		spin_lock(&inode->i_lock);
		goto retry;
	}

	/* we flushed them all; remove this inode from the queue */
	spin_lock(&mdsc->snap_flush_lock);
	list_del_init(&ci->i_snap_flush_item);
	spin_unlock(&mdsc->snap_flush_lock);

	if (psession)
		*psession = session;
	else if (session) {
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);
	}
}

static void ceph_flush_snaps(struct ceph_inode_info *ci)
{
	struct inode *inode = &ci->vfs_inode;

	spin_lock(&inode->i_lock);
	__ceph_flush_snaps(ci, NULL);
	spin_unlock(&inode->i_lock);
}

/*
 * Mark caps dirty.  If inode is newly dirty, add to the global dirty
 * list.
 */
void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
{
	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
	struct inode *inode = &ci->vfs_inode;
	int was = ci->i_dirty_caps;
	int dirty = 0;

	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
	     ceph_cap_string(mask), ceph_cap_string(was),
	     ceph_cap_string(was | mask));
	ci->i_dirty_caps |= mask;
	if (was == 0) {
		dout(" inode %p now dirty\n", &ci->vfs_inode);
		BUG_ON(!list_empty(&ci->i_dirty_item));
		spin_lock(&mdsc->cap_dirty_lock);
		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
		spin_unlock(&mdsc->cap_dirty_lock);
		if (ci->i_flushing_caps == 0) {
			igrab(inode);
			dirty |= I_DIRTY_SYNC;
		}
	} else {
		BUG_ON(list_empty(&ci->i_dirty_item));
	}
	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
	    (mask & CEPH_CAP_FILE_BUFFER))
		dirty |= I_DIRTY_DATASYNC;
	if (dirty)
		__mark_inode_dirty(inode, dirty);
	__cap_delay_requeue(mdsc, ci);
}

/*
 * Add dirty inode to the flushing list.  Assigned a seq number so we
 * can wait for caps to flush without starving.
 *
 * Called under i_lock.
 */
static int __mark_caps_flushing(struct inode *inode,
				struct ceph_mds_session *session)
{
	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int flushing;

	BUG_ON(ci->i_dirty_caps == 0);
	BUG_ON(list_empty(&ci->i_dirty_item));

	flushing = ci->i_dirty_caps;
	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
	     ceph_cap_string(flushing),
	     ceph_cap_string(ci->i_flushing_caps),
	     ceph_cap_string(ci->i_flushing_caps | flushing));
	ci->i_flushing_caps |= flushing;
	ci->i_dirty_caps = 0;
	dout(" inode %p now !dirty\n", inode);

	spin_lock(&mdsc->cap_dirty_lock);
	list_del_init(&ci->i_dirty_item);

	ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
	if (list_empty(&ci->i_flushing_item)) {
		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
		mdsc->num_cap_flushing++;
		dout(" inode %p now flushing seq %lld\n", inode,
		     ci->i_cap_flush_seq);
	} else {
		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
		dout(" inode %p now flushing (more) seq %lld\n", inode,
		     ci->i_cap_flush_seq);
	}
	spin_unlock(&mdsc->cap_dirty_lock);

	return flushing;
}

/*
 * try to invalidate mapping pages without blocking.
 */
static int mapping_is_empty(struct address_space *mapping)
{
	struct page *page = find_get_page(mapping, 0);

	if (!page)
		return 1;

	put_page(page);
	return 0;
}

static int try_nonblocking_invalidate(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	u32 invalidating_gen = ci->i_rdcache_gen;

	spin_unlock(&inode->i_lock);
	invalidate_mapping_pages(&inode->i_data, 0, -1);
	spin_lock(&inode->i_lock);

	if (mapping_is_empty(&inode->i_data) &&
	    invalidating_gen == ci->i_rdcache_gen) {
		/* success. */
		dout("try_nonblocking_invalidate %p success\n", inode);
		ci->i_rdcache_gen = 0;
		ci->i_rdcache_revoking = 0;
		return 0;
	}
	dout("try_nonblocking_invalidate %p failed\n", inode);
	return -1;
}
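
/*
 * Callers hold i_lock across the call (it is dropped and retaken
 * internally).  On failure (pages locked, or a new read raced in and
 * bumped i_rdcache_gen), callers typically queue a blocking invalidate
 * in a separate thread and retry later; see ceph_check_caps() and
 * handle_cap_grant() below.
 */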

/*
 * Swiss army knife function to examine currently used and wanted
 * versus held caps.  Release, flush, ack revoked caps to mds as
 * appropriate.
 *
 *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
 *    cap release further.
 *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
 *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
 *    further delay.
 */
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
		     struct ceph_mds_session *session)
	__releases(session->s_mutex)
{
	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct inode *inode = &ci->vfs_inode;
	struct ceph_cap *cap;
	int file_wanted, used;
	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
	int issued, implemented, want, retain, revoking, flushing = 0;
	int mds = -1;   /* keep track of how far we've gone through i_caps list
			   to avoid an infinite loop on retry */
	struct rb_node *p;
	int tried_invalidate = 0;
	int delayed = 0, sent = 0, force_requeue = 0, num;
	int queue_invalidate = 0;
	int is_delayed = flags & CHECK_CAPS_NODELAY;

	/* if we are unmounting, flush any unused caps immediately. */
	if (mdsc->stopping)
		is_delayed = 1;

	spin_lock(&inode->i_lock);

	if (ci->i_ceph_flags & CEPH_I_FLUSH)
		flags |= CHECK_CAPS_FLUSH;

	/* flush snaps first time around only */
	if (!list_empty(&ci->i_cap_snaps))
		__ceph_flush_snaps(ci, &session);
	goto retry_locked;
retry:
	spin_lock(&inode->i_lock);
retry_locked:
	file_wanted = __ceph_caps_file_wanted(ci);
	used = __ceph_caps_used(ci);
	want = file_wanted | used;
	issued = __ceph_caps_issued(ci, &implemented);
	revoking = implemented & ~issued;

	retain = want | CEPH_CAP_PIN;
	if (!mdsc->stopping && inode->i_nlink > 0) {
		if (want) {
			retain |= CEPH_CAP_ANY;       /* be greedy */
		} else {
			retain |= CEPH_CAP_ANY_SHARED;
			/*
			 * keep RD only if we didn't have the file open RW,
			 * because then the mds would revoke it anyway to
			 * journal max_size=0.
			 */
			if (ci->i_max_size == 0)
				retain |= CEPH_CAP_ANY_RD;
		}
	}

	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
	     " issued %s revoking %s retain %s %s%s%s\n", inode,
	     ceph_cap_string(file_wanted),
	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
	     ceph_cap_string(ci->i_flushing_caps),
	     ceph_cap_string(issued), ceph_cap_string(revoking),
	     ceph_cap_string(retain),
	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");

	/*
	 * If we no longer need to hold onto our old caps, and we may
	 * have cached pages, but don't want them, then try to invalidate.
	 * If we fail, it's because pages are locked.... try again later.
	 */
	if ((!is_delayed || mdsc->stopping) &&
	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
	    ci->i_rdcache_gen &&                     /* may have cached pages */
	    (file_wanted == 0 ||                     /* no open files */
	     (revoking & CEPH_CAP_FILE_CACHE)) &&    /* or revoking cache */
	    !tried_invalidate) {
		dout("check_caps trying to invalidate on %p\n", inode);
		if (try_nonblocking_invalidate(inode) < 0) {
			if (revoking & CEPH_CAP_FILE_CACHE) {
				dout("check_caps queuing invalidate\n");
				queue_invalidate = 1;
				ci->i_rdcache_revoking = ci->i_rdcache_gen;
			} else {
				dout("check_caps failed to invalidate pages\n");
				/* we failed to invalidate pages.  check these
				   caps again later. */
				force_requeue = 1;
				__cap_set_timeouts(mdsc, ci);
			}
		}
		tried_invalidate = 1;
		goto retry_locked;
	}

	num = 0;
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		num++;

		/* avoid looping forever */
		if (mds >= cap->mds ||
		    ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
			continue;

		/* NOTE: no side-effects allowed, until we take s_mutex */

		revoking = cap->implemented & ~cap->issued;
		if (revoking)
			dout(" mds%d revoking %s\n", cap->mds,
			     ceph_cap_string(revoking));

		if (cap == ci->i_auth_cap &&
		    (cap->issued & CEPH_CAP_FILE_WR)) {
			/* request larger max_size from MDS? */
			if (ci->i_wanted_max_size > ci->i_max_size &&
			    ci->i_wanted_max_size > ci->i_requested_max_size) {
				dout("requesting new max_size\n");
				goto ack;
			}

			/* approaching file_max? */
			if ((inode->i_size << 1) >= ci->i_max_size &&
			    (ci->i_reported_size << 1) < ci->i_max_size) {
				dout("i_size approaching max_size\n");
				goto ack;
			}
		}
		/* flush anything dirty? */
		if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
		    ci->i_dirty_caps) {
			dout("flushing dirty caps\n");
			goto ack;
		}

		/* completed revocation? going down and there are no caps? */
		if (revoking && (revoking & used) == 0) {
			dout("completed revocation of %s\n",
			     ceph_cap_string(cap->implemented & ~cap->issued));
			goto ack;
		}

		/* want more caps from mds? */
		if (want & ~(cap->mds_wanted | cap->issued))
			goto ack;

		/* things we might delay */
		if ((cap->issued & ~retain) == 0 &&
		    cap->mds_wanted == want)
			continue;     /* nope, all good */

		if (is_delayed)
			goto ack;

		/* delay? */
		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
		    time_before(jiffies, ci->i_hold_caps_max)) {
			dout(" delaying issued %s -> %s, wanted %s -> %s\n",
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(cap->issued & retain),
			     ceph_cap_string(cap->mds_wanted),
			     ceph_cap_string(want));
			delayed++;
			continue;
		}

ack:
		if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
			dout(" skipping %p I_NOFLUSH set\n", inode);
			continue;
		}

		if (session && session != cap->session) {
			dout("oops, wrong session %p mutex\n", session);
			mutex_unlock(&session->s_mutex);
			session = NULL;
		}
		if (!session) {
			session = cap->session;
			if (mutex_trylock(&session->s_mutex) == 0) {
				dout("inverting session/ino locks on %p\n",
				     session);
				spin_unlock(&inode->i_lock);
				if (took_snap_rwsem) {
					up_read(&mdsc->snap_rwsem);
					took_snap_rwsem = 0;
				}
				mutex_lock(&session->s_mutex);
				goto retry;
			}
		}
		/* take snap_rwsem after session mutex */
		if (!took_snap_rwsem) {
			if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
				dout("inverting snap/in locks on %p\n",
				     inode);
				spin_unlock(&inode->i_lock);
				down_read(&mdsc->snap_rwsem);
				took_snap_rwsem = 1;
				goto retry;
			}
			took_snap_rwsem = 1;
		}

		if (cap == ci->i_auth_cap && ci->i_dirty_caps)
			flushing = __mark_caps_flushing(inode, session);

		mds = cap->mds;  /* remember mds, so we don't repeat */
		sent++;

		/* __send_cap drops i_lock */
		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
				      retain, flushing, NULL);
		goto retry; /* retake i_lock and restart our cap scan. */
	}

	/*
	 * Reschedule delayed caps release if we delayed anything,
	 * otherwise cancel.
	 */
	if (delayed && is_delayed)
		force_requeue = 1;   /* __send_cap delayed release; requeue */
	if (!delayed && !is_delayed)
		__cap_delay_cancel(mdsc, ci);
	else if (!is_delayed || force_requeue)
		__cap_delay_requeue(mdsc, ci);

	spin_unlock(&inode->i_lock);

	if (queue_invalidate)
		ceph_queue_invalidate(inode);

	if (session)
		mutex_unlock(&session->s_mutex);
	if (took_snap_rwsem)
		up_read(&mdsc->snap_rwsem);
}

/*
 * Try to flush dirty caps back to the auth mds.
 */
static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
			  unsigned *flush_tid)
{
	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int unlock_session = session ? 0 : 1;
	int flushing = 0;

retry:
	spin_lock(&inode->i_lock);
	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
		dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
		goto out;
	}
	if (ci->i_dirty_caps && ci->i_auth_cap) {
		struct ceph_cap *cap = ci->i_auth_cap;
		int used = __ceph_caps_used(ci);
		int want = __ceph_caps_wanted(ci);
		int delayed;

		if (!session) {
			spin_unlock(&inode->i_lock);
			session = cap->session;
			mutex_lock(&session->s_mutex);
			goto retry;
		}
		BUG_ON(session != cap->session);
		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
			goto out;

		flushing = __mark_caps_flushing(inode, session);

		/* __send_cap drops i_lock */
		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
				     cap->issued | cap->implemented, flushing,
				     flush_tid);
		if (!delayed)
			goto out_unlocked;

		spin_lock(&inode->i_lock);
		__cap_delay_requeue(mdsc, ci);
	}
out:
	spin_unlock(&inode->i_lock);
out_unlocked:
	if (session && unlock_session)
		mutex_unlock(&session->s_mutex);
	return flushing;
}

/*
 * Return true if we've flushed caps through the given flush_tid.
 */
static int caps_are_flushed(struct inode *inode, unsigned tid)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int dirty, i, ret = 1;

	spin_lock(&inode->i_lock);
	dirty = __ceph_caps_dirty(ci);
	for (i = 0; i < CEPH_CAP_BITS; i++)
		if ((ci->i_flushing_caps & (1 << i)) &&
		    ci->i_cap_flush_tid[i] <= tid) {
			/* still flushing this bit */
			ret = 0;
			break;
		}
	spin_unlock(&inode->i_lock);
	return ret;
}

/*
 * Wait on any unsafe replies for the given inode.  First wait on the
 * newest request, and make that the upper bound.  Then, if there are
 * more requests, keep waiting on the oldest as long as it is still older
 * than the original request.
 */
static void sync_write_wait(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct list_head *head = &ci->i_unsafe_writes;
	struct ceph_osd_request *req;
	u64 last_tid;

	spin_lock(&ci->i_unsafe_lock);
	if (list_empty(head))
		goto out;

	/* set upper bound as _last_ entry in chain */
	req = list_entry(head->prev, struct ceph_osd_request,
			 r_unsafe_item);
	last_tid = req->r_tid;

	do {
		ceph_osdc_get_request(req);
		spin_unlock(&ci->i_unsafe_lock);
		dout("sync_write_wait on tid %llu (until %llu)\n",
		     req->r_tid, last_tid);
		wait_for_completion(&req->r_safe_completion);
		spin_lock(&ci->i_unsafe_lock);
		ceph_osdc_put_request(req);

		/*
		 * from here on look at first entry in chain, since we
		 * only want to wait for anything older than last_tid
		 */
		if (list_empty(head))
			break;
		req = list_entry(head->next, struct ceph_osd_request,
				 r_unsafe_item);
	} while (req->r_tid < last_tid);
out:
	spin_unlock(&ci->i_unsafe_lock);
}

int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	unsigned flush_tid;
	int ret;
	int dirty;

	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
	sync_write_wait(inode);

	ret = filemap_write_and_wait(inode->i_mapping);
	if (ret < 0)
		return ret;

	dirty = try_flush_caps(inode, NULL, &flush_tid);
	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));

	/*
	 * only wait on non-file metadata writeback (the mds
	 * can recover size and mtime, so we don't need to
	 * wait for that)
	 */
	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
		dout("fsync waiting for flush_tid %u\n", flush_tid);
		ret = wait_event_interruptible(ci->i_cap_wq,
				       caps_are_flushed(inode, flush_tid));
	}

	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
	return ret;
}

/*
 * Flush any dirty caps back to the mds.  If we aren't asked to wait,
 * queue inode for flush but don't do so immediately, because we can
 * get by with fewer MDS messages if we wait for data writeback to
 * complete first.
 */
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	unsigned flush_tid;
	int err = 0;
	int dirty;
	int wait = wbc->sync_mode == WB_SYNC_ALL;

	dout("write_inode %p wait=%d\n", inode, wait);
	if (wait) {
		dirty = try_flush_caps(inode, NULL, &flush_tid);
		if (dirty)
			err = wait_event_interruptible(ci->i_cap_wq,
				       caps_are_flushed(inode, flush_tid));
	} else {
		struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;

		spin_lock(&inode->i_lock);
		if (__ceph_caps_dirty(ci))
			__cap_delay_requeue_front(mdsc, ci);
		spin_unlock(&inode->i_lock);
	}
	return err;
}

/*
 * After a recovering MDS goes active, we need to resend any caps
 * we were flushing.
 *
 * Caller holds session->s_mutex.
 */
static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
				   struct ceph_mds_session *session)
{
	struct ceph_cap_snap *capsnap;

	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
			    flushing_item) {
		struct ceph_inode_info *ci = capsnap->ci;
		struct inode *inode = &ci->vfs_inode;
		struct ceph_cap *cap;

		spin_lock(&inode->i_lock);
		cap = ci->i_auth_cap;
		if (cap && cap->session == session) {
			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
			     cap, capsnap);
			/* __ceph_flush_snaps returns with i_lock held */
			__ceph_flush_snaps(ci, &session);
		} else {
			pr_err("%p auth cap %p not mds%d ???\n", inode,
			       cap, session->s_mds);
		}
		spin_unlock(&inode->i_lock);
	}
}

void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
			     struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci;

	kick_flushing_capsnaps(mdsc, session);

	dout("kick_flushing_caps mds%d\n", session->s_mds);
	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
		struct inode *inode = &ci->vfs_inode;
		struct ceph_cap *cap;
		int delayed = 0;

		spin_lock(&inode->i_lock);
		cap = ci->i_auth_cap;
		if (cap && cap->session == session) {
			dout("kick_flushing_caps %p cap %p %s\n", inode,
			     cap, ceph_cap_string(ci->i_flushing_caps));
			delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
					     __ceph_caps_used(ci),
					     __ceph_caps_wanted(ci),
					     cap->issued | cap->implemented,
					     ci->i_flushing_caps, NULL);
			if (delayed) {
				spin_lock(&inode->i_lock);
				__cap_delay_requeue(mdsc, ci);
				spin_unlock(&inode->i_lock);
			}
		} else {
			pr_err("%p auth cap %p not mds%d ???\n", inode,
			       cap, session->s_mds);
			spin_unlock(&inode->i_lock);
		}
	}
}

/*
 * Take references to capabilities we hold, so that we don't release
 * them to the MDS prematurely.
 *
 * Protected by i_lock.
 */
static void __take_cap_refs(struct ceph_inode_info *ci, int got)
{
	if (got & CEPH_CAP_PIN)
		ci->i_pin_ref++;
	if (got & CEPH_CAP_FILE_RD)
		ci->i_rd_ref++;
	if (got & CEPH_CAP_FILE_CACHE)
		ci->i_rdcache_ref++;
	if (got & CEPH_CAP_FILE_WR)
		ci->i_wr_ref++;
	if (got & CEPH_CAP_FILE_BUFFER) {
		if (ci->i_wrbuffer_ref == 0)
			igrab(&ci->vfs_inode);
		ci->i_wrbuffer_ref++;
		dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
		     &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
	}
}
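
/*
 * The first FILE_BUFFER ref pins the inode with igrab(); the matching
 * iput() happens in ceph_put_cap_refs() or ceph_put_wrbuffer_cap_refs()
 * when the last buffered ref is dropped.
 */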

/*
 * Try to grab cap references.  Specify those refs we @want, and the
 * minimal set we @need.  Also include the larger offset we are writing
 * to (when applicable), and check against max_size here as well.
 * Note that caller is responsible for ensuring max_size increases are
 * requested from the MDS.
 */
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
			    int *got, loff_t endoff, int *check_max, int *err)
{
	struct inode *inode = &ci->vfs_inode;
	int ret = 0;
	int have, implemented;
	int file_wanted;

	dout("get_cap_refs %p need %s want %s\n", inode,
	     ceph_cap_string(need), ceph_cap_string(want));
	spin_lock(&inode->i_lock);

	/* make sure file is actually open */
	file_wanted = __ceph_caps_file_wanted(ci);
	if ((file_wanted & need) == 0) {
		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
		     ceph_cap_string(need), ceph_cap_string(file_wanted));
		*err = -EBADF;
		ret = 1;
		goto out;
	}

	if (need & CEPH_CAP_FILE_WR) {
		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
			     inode, endoff, ci->i_max_size);
			if (endoff > ci->i_wanted_max_size) {
				*check_max = 1;
				ret = 1;
			}
			goto out;
		}
		/*
		 * If a sync write is in progress, we must wait, so that we
		 * can get a final snapshot value for size+mtime.
		 */
		if (__ceph_have_pending_cap_snap(ci)) {
			dout("get_cap_refs %p cap_snap_pending\n", inode);
			goto out;
		}
	}
	have = __ceph_caps_issued(ci, &implemented);

	/*
	 * disallow writes while a truncate is pending
	 */
	if (ci->i_truncate_pending)
		have &= ~CEPH_CAP_FILE_WR;

	if ((have & need) == need) {
		/*
		 * Look at (implemented & ~have & not) so that we keep waiting
		 * on transition from wanted -> needed caps.  This is needed
		 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
		 * going before a prior buffered writeback happens.
		 */
		int not = want & ~(have & need);
		int revoking = implemented & ~have;
		dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
		     inode, ceph_cap_string(have), ceph_cap_string(not),
		     ceph_cap_string(revoking));
		if ((revoking & not) == 0) {
			*got = need | (have & want);
			__take_cap_refs(ci, *got);
			ret = 1;
		}
	} else {
		dout("get_cap_refs %p have %s needed %s\n", inode,
		     ceph_cap_string(have), ceph_cap_string(need));
	}
out:
	spin_unlock(&inode->i_lock);
	dout("get_cap_refs %p ret %d got %s\n", inode,
	     ret, ceph_cap_string(*got));
	return ret;
}
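
/*
 * Return semantics: 0 means "keep waiting" (caps are still being
 * revoked or a cap_snap is pending); 1 means "done" -- either *got is
 * filled in, or *err/*check_max was set and the caller should re-check
 * max_size or bail out.  ceph_get_caps() below loops on this contract.
 */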

/*
 * Check the offset we are writing up to against our current
 * max_size.  If necessary, tell the MDS we want to write to
 * a larger offset.
 */
static void check_max_size(struct inode *inode, loff_t endoff)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int check = 0;

	/* do we need to explicitly request a larger max_size? */
	spin_lock(&inode->i_lock);
	if ((endoff >= ci->i_max_size ||
	     endoff > (inode->i_size << 1)) &&
	    endoff > ci->i_wanted_max_size) {
		dout("write %p at large endoff %llu, req max_size\n",
		     inode, endoff);
		ci->i_wanted_max_size = endoff;
		check = 1;
	}
	spin_unlock(&inode->i_lock);
	if (check)
		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}

/*
 * Wait for caps, and take cap references.  If we can't get a WR cap
 * due to a small max_size, make sure we check_max_size (and possibly
 * ask the mds) so we don't get hung up indefinitely.
 */
int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
		  loff_t endoff)
{
	int check_max, ret, err;

retry:
	if (endoff > 0)
		check_max_size(&ci->vfs_inode, endoff);
	check_max = 0;
	err = 0;
	ret = wait_event_interruptible(ci->i_cap_wq,
				       try_get_cap_refs(ci, need, want,
							got, endoff,
							&check_max, &err));
	if (err)
		ret = err;
	if (check_max)
		goto retry;
	return ret;
}

/*
 * Take cap refs.  Caller must already know we hold at least one ref
 * on the caps in question or we don't know this is safe.
 */
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
	spin_lock(&ci->vfs_inode.i_lock);
	__take_cap_refs(ci, caps);
	spin_unlock(&ci->vfs_inode.i_lock);
}
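
/*
 * Every reference taken here (or via a successful ceph_get_caps())
 * must eventually be returned with ceph_put_cap_refs() for the same
 * cap bits, or the caps will never be released back to the MDS.
 */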

/*
 * Release cap refs.
 *
 * If we released the last ref on any given cap, call ceph_check_caps
 * to release (or schedule a release).
 *
 * If we are releasing a WR cap (from a sync write), finalize any affected
 * cap_snap, and wake up any waiters.
 */
void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
{
	struct inode *inode = &ci->vfs_inode;
	int last = 0, put = 0, flushsnaps = 0, wake = 0;
	struct ceph_cap_snap *capsnap;

	spin_lock(&inode->i_lock);
	if (had & CEPH_CAP_PIN)
		--ci->i_pin_ref;
	if (had & CEPH_CAP_FILE_RD)
		if (--ci->i_rd_ref == 0)
			last++;
	if (had & CEPH_CAP_FILE_CACHE)
		if (--ci->i_rdcache_ref == 0)
			last++;
	if (had & CEPH_CAP_FILE_BUFFER) {
		if (--ci->i_wrbuffer_ref == 0) {
			last++;
			put++;
		}
		dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
		     inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
	}
	if (had & CEPH_CAP_FILE_WR)
		if (--ci->i_wr_ref == 0) {
			last++;
			if (!list_empty(&ci->i_cap_snaps)) {
				capsnap = list_first_entry(&ci->i_cap_snaps,
						     struct ceph_cap_snap,
						     ci_item);
				if (capsnap->writing) {
					capsnap->writing = 0;
					flushsnaps =
						__ceph_finish_cap_snap(ci,
								       capsnap);
					wake = 1;
				}
			}
		}
	spin_unlock(&inode->i_lock);

	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
	     last ? " last" : "", put ? " put" : "");

	if (last && !flushsnaps)
		ceph_check_caps(ci, 0, NULL);
	else if (flushsnaps)
		ceph_flush_snaps(ci);
	if (wake)
		wake_up(&ci->i_cap_wq);
	if (put)
		iput(inode);
}

/*
 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
 * context.  Adjust per-snap dirty page accounting as appropriate.
 * Once all dirty data for a cap_snap is flushed, flush snapped file
 * metadata back to the MDS.  If we dropped the last ref, call
 * ceph_check_caps.
 */
void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				struct ceph_snap_context *snapc)
{
	struct inode *inode = &ci->vfs_inode;
	int last = 0;
	int complete_capsnap = 0;
	int drop_capsnap = 0;
	int found = 0;
	struct ceph_cap_snap *capsnap = NULL;

	spin_lock(&inode->i_lock);
	ci->i_wrbuffer_ref -= nr;
	last = !ci->i_wrbuffer_ref;

	if (ci->i_head_snapc == snapc) {
		ci->i_wrbuffer_ref_head -= nr;
		if (!ci->i_wrbuffer_ref_head) {
			ceph_put_snap_context(ci->i_head_snapc);
			ci->i_head_snapc = NULL;
		}
		dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
		     inode,
		     ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
		     last ? " LAST" : "");
	} else {
		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
			if (capsnap->context == snapc) {
				found = 1;
				break;
			}
		}
		BUG_ON(!found);
		capsnap->dirty_pages -= nr;
		if (capsnap->dirty_pages == 0) {
			complete_capsnap = 1;
			if (capsnap->dirty == 0)
				/* cap writeback completed before we created
				 * the cap_snap; no FLUSHSNAP is needed */
				drop_capsnap = 1;
		}
		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
		     inode, capsnap, capsnap->context->seq,
		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
		     last ? " (wrbuffer last)" : "",
		     complete_capsnap ? " (complete capsnap)" : "",
		     drop_capsnap ? " (drop capsnap)" : "");
		if (drop_capsnap) {
			ceph_put_snap_context(capsnap->context);
			list_del(&capsnap->ci_item);
			list_del(&capsnap->flushing_item);
			ceph_put_cap_snap(capsnap);
		}
	}

	spin_unlock(&inode->i_lock);

	if (last) {
		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
		iput(inode);
	} else if (complete_capsnap) {
		ceph_flush_snaps(ci);
		wake_up(&ci->i_cap_wq);
	}
	if (drop_capsnap)
		iput(inode);
}

2217 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2218 * actually be a revocation if it specifies a smaller cap set.)
2220 * caller holds s_mutex and i_lock, we drop both.
2224 * 1 - check_caps on auth cap only (writeback)
2225 * 2 - check_caps (ack revoke)
2227 static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2228 struct ceph_mds_session *session,
2229 struct ceph_cap *cap,
2230 struct ceph_buffer *xattr_buf)
2231 __releases(inode->i_lock)
2232 __releases(session->s_mutex)
2234 struct ceph_inode_info *ci = ceph_inode(inode);
2235 int mds = session->s_mds;
2236 int seq = le32_to_cpu(grant->seq);
2237 int newcaps = le32_to_cpu(grant->caps);
2238 int issued, implemented, used, wanted, dirty;
2239 u64 size = le64_to_cpu(grant->size);
2240 u64 max_size = le64_to_cpu(grant->max_size);
2241 struct timespec mtime, atime, ctime;
2245 int revoked_rdcache = 0;
2246 int queue_invalidate = 0;
2248 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2249 inode, cap, mds, seq, ceph_cap_string(newcaps));
2250 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,

	/*
	 * If CACHE is being revoked, and we have no dirty buffers,
	 * try to invalidate (once).  (If there are dirty buffers, we
	 * will invalidate _after_ writeback.)
	 */
	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
	    !ci->i_wrbuffer_ref) {
		if (try_nonblocking_invalidate(inode) == 0) {
			revoked_rdcache = 1;
		} else {
			/* there were locked pages.. invalidate later
			   in a separate thread. */
			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				queue_invalidate = 1;
				ci->i_rdcache_revoking = ci->i_rdcache_gen;
			}
		}
	}

	/* side effects now are allowed */

	issued = __ceph_caps_issued(ci, &implemented);
	issued |= implemented | __ceph_caps_dirty(ci);
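	/*
	 * If we hold (or are flushing) newer state for a field, i.e. the
	 * relevant EXCL cap or a dirty bit, our cached copy wins; the
	 * (issued & *_EXCL) checks below skip those fields.
	 */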

	cap->cap_gen = session->s_cap_gen;

	__check_cap_issue(ci, cap, newcaps);

	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
		inode->i_mode = le32_to_cpu(grant->mode);
		inode->i_uid = le32_to_cpu(grant->uid);
		inode->i_gid = le32_to_cpu(grant->gid);
		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
		     inode->i_uid, inode->i_gid);
	}

	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
		inode->i_nlink = le32_to_cpu(grant->nlink);

	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
		int len = le32_to_cpu(grant->xattr_len);
		u64 version = le64_to_cpu(grant->xattr_version);

		if (version > ci->i_xattrs.version) {
			dout(" got new xattrs v%llu on %p len %d\n",
			     version, inode, len);
			if (ci->i_xattrs.blob)
				ceph_buffer_put(ci->i_xattrs.blob);
			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
			ci->i_xattrs.version = version;
		}
	}

	/* size/ctime/mtime/atime? */
	ceph_fill_file_size(inode, issued,
			    le32_to_cpu(grant->truncate_seq),
			    le64_to_cpu(grant->truncate_size), size);
	ceph_decode_timespec(&mtime, &grant->mtime);
	ceph_decode_timespec(&atime, &grant->atime);
	ceph_decode_timespec(&ctime, &grant->ctime);
	ceph_fill_file_time(inode, issued,
			    le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
			    &atime);

	/* max size increase? */
	if (max_size != ci->i_max_size) {
		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
		ci->i_max_size = max_size;
		if (max_size >= ci->i_wanted_max_size) {
			ci->i_wanted_max_size = 0;  /* reset */
			ci->i_requested_max_size = 0;
		}
		wake = 1;
	}

	/* check cap bits */
	wanted = __ceph_caps_wanted(ci);
	used = __ceph_caps_used(ci);
	dirty = __ceph_caps_dirty(ci);
	dout(" my wanted = %s, used = %s, dirty %s\n",
	     ceph_cap_string(wanted),
	     ceph_cap_string(used),
	     ceph_cap_string(dirty));
	if (wanted != le32_to_cpu(grant->wanted)) {
		dout("mds wanted %s -> %s\n",
		     ceph_cap_string(le32_to_cpu(grant->wanted)),
		     ceph_cap_string(wanted));
		grant->wanted = cpu_to_le32(wanted);
	}

	cap->seq = seq;

	/* file layout may have changed */
	ci->i_layout = grant->layout;

	/* revocation, grant, or no-op? */
	if (cap->issued & ~newcaps) {
		dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
		     ceph_cap_string(newcaps));
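		/*
		 * Decide how to respond: dirty pages under a revoked cap
		 * must be written back first; dirty metadata is flushed
		 * via check_caps; otherwise we can ack the revocation
		 * right away.
		 */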
		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
			writeback = 1; /* will delay ack */
		else if (dirty & ~newcaps)
			check_caps = 1; /* initiate writeback in check_caps */
		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
			   revoked_rdcache)
			check_caps = 2; /* send revoke ack in check_caps */
		cap->issued = newcaps;
		cap->implemented |= newcaps;
	} else if (cap->issued == newcaps) {
		dout("caps unchanged: %s -> %s\n",
		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
	} else {
		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
		     ceph_cap_string(newcaps));
		cap->issued = newcaps;
		cap->implemented |= newcaps; /* add bits only, to
					      * avoid stepping on a
					      * pending revocation */
	}

	BUG_ON(cap->issued & ~cap->implemented);

	spin_unlock(&inode->i_lock);

	if (writeback)
		/*
		 * queue inode for writeback: we can't actually call
		 * filemap_write_and_wait, etc. from message handler
		 */
		ceph_queue_writeback(inode);
	if (queue_invalidate)
		ceph_queue_invalidate(inode);
	if (wake)
		wake_up(&ci->i_cap_wq);

	if (check_caps == 1)
		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
				session);
	else if (check_caps == 2)
		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
	else
		mutex_unlock(&session->s_mutex);
}

/*
 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
 * MDS has been safely committed.
 */
static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
				 struct ceph_mds_caps *m,
				 struct ceph_mds_session *session,
				 struct ceph_cap *cap)
	__releases(inode->i_lock)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
	unsigned seq = le32_to_cpu(m->seq);
	int dirty = le32_to_cpu(m->dirty);
	int cleaned = 0;
	int drop = 0;
	int i;
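
	/*
	 * Each dirty cap bit remembers the tid of the flush that carried
	 * it (i_cap_flush_tid[]); only the bits whose latest flush
	 * matches this ack's tid are considered cleaned.
	 */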
	for (i = 0; i < CEPH_CAP_BITS; i++)
		if ((dirty & (1 << i)) &&
		    flush_tid == ci->i_cap_flush_tid[i])
			cleaned |= 1 << i;

	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
	     " flushing %s -> %s\n",
	     inode, session->s_mds, seq, ceph_cap_string(dirty),
	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));

	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
		goto out;

	ci->i_flushing_caps &= ~cleaned;

	spin_lock(&mdsc->cap_dirty_lock);
	if (ci->i_flushing_caps == 0) {
		list_del_init(&ci->i_flushing_item);
		if (!list_empty(&session->s_cap_flushing))
			dout(" mds%d still flushing cap on %p\n",
			     session->s_mds,
			     &list_entry(session->s_cap_flushing.next,
					 struct ceph_inode_info,
					 i_flushing_item)->vfs_inode);
		mdsc->num_cap_flushing--;
		wake_up(&mdsc->cap_flushing_wq);
		dout(" inode %p now !flushing\n", inode);

		if (ci->i_dirty_caps == 0) {
			dout(" inode %p now clean\n", inode);
			BUG_ON(!list_empty(&ci->i_dirty_item));
			drop = 1;
		} else {
			BUG_ON(list_empty(&ci->i_dirty_item));
		}
	}
	spin_unlock(&mdsc->cap_dirty_lock);
	wake_up(&ci->i_cap_wq);

out:
	spin_unlock(&inode->i_lock);
	if (drop)
		iput(inode);
}

/*
 * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
 * throw away our cap_snap.
 *
 * Caller holds s_mutex.
 */
static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
				     struct ceph_mds_caps *m,
				     struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 follows = le64_to_cpu(m->snap_follows);
	struct ceph_cap_snap *capsnap;
	int drop = 0;

	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
	     inode, ci, session->s_mds, follows);
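
	/*
	 * Find the cap_snap this ack covers: match on @follows, but also
	 * verify the flush tid so a stale ack can't tear down a newer
	 * cap_snap for the same snap.
	 */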
	spin_lock(&inode->i_lock);
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		if (capsnap->follows == follows) {
			if (capsnap->flush_tid != flush_tid) {
				dout(" cap_snap %p follows %lld tid %lld !="
				     " %lld\n", capsnap, follows,
				     flush_tid, capsnap->flush_tid);
				break;
			}
			WARN_ON(capsnap->dirty_pages || capsnap->writing);
			dout(" removing %p cap_snap %p follows %lld\n",
			     inode, capsnap, follows);
			ceph_put_snap_context(capsnap->context);
			list_del(&capsnap->ci_item);
			list_del(&capsnap->flushing_item);
			ceph_put_cap_snap(capsnap);
			drop = 1;
			break;
		} else {
			dout(" skipping cap_snap %p follows %lld\n",
			     capsnap, capsnap->follows);
		}
	}
	spin_unlock(&inode->i_lock);
	if (drop)
		iput(inode);
}

/*
 * Handle TRUNC from MDS, indicating file truncation.
 *
 * caller holds s_mutex.
 */
static void handle_cap_trunc(struct inode *inode,
			     struct ceph_mds_caps *trunc,
			     struct ceph_mds_session *session)
	__releases(inode->i_lock)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	int seq = le32_to_cpu(trunc->seq);
	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
	u64 size = le64_to_cpu(trunc->size);
	int implemented = 0;
	int dirty = __ceph_caps_dirty(ci);
	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
	int queue_trunc = 0;
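
	/*
	 * Include implemented and dirty bits so ceph_fill_file_size()
	 * can tell whether our cached file state may be newer than
	 * what the MDS reports.
	 */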
	issued |= implemented | dirty;

	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
	     inode, mds, seq, truncate_size, truncate_seq);
	queue_trunc = ceph_fill_file_size(inode, issued,
					  truncate_seq, truncate_size, size);
	spin_unlock(&inode->i_lock);

	if (queue_trunc)
		ceph_queue_vmtruncate(inode);
}

/*
 * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
 * different one.  If we are the most recent migration we've seen (as
 * indicated by mseq), make note of the migrating cap bits for the
 * duration (until we see the corresponding IMPORT).
 *
 * caller holds s_mutex
 */
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
			      struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	unsigned mseq = le32_to_cpu(ex->migrate_seq);
	struct ceph_cap *cap = NULL, *t;
	struct rb_node *p;
	int remember = 1;

	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
	     inode, ci, mds, mseq);

	spin_lock(&inode->i_lock);

	/* make sure we haven't seen a higher mseq */
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		t = rb_entry(p, struct ceph_cap, ci_node);
		if (ceph_seq_cmp(t->mseq, mseq) > 0) {
			dout(" higher mseq on cap from mds%d\n",
			     t->session->s_mds);
			remember = 0;
		}
		if (t->session->s_mds == mds)
			cap = t;
	}
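
	/*
	 * Stash the issued bits in i_cap_exporting_* so they stay
	 * visible to cap accounting until the corresponding IMPORT
	 * arrives from the target MDS.
	 */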
	if (cap) {
		if (remember) {
			/* make note */
			ci->i_cap_exporting_mds = mds;
			ci->i_cap_exporting_mseq = mseq;
			ci->i_cap_exporting_issued = cap->issued;
		}
		__ceph_remove_cap(cap);
	}
	/* else, we already released it */

	spin_unlock(&inode->i_lock);
}

/*
 * Handle cap IMPORT.  If there are temp bits from an older EXPORT,
 * clean them up.
 *
 * caller holds s_mutex.
 */
static void handle_cap_import(struct ceph_mds_client *mdsc,
			      struct inode *inode, struct ceph_mds_caps *im,
			      struct ceph_mds_session *session,
			      void *snaptrace, int snaptrace_len)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	unsigned issued = le32_to_cpu(im->caps);
	unsigned wanted = le32_to_cpu(im->wanted);
	unsigned seq = le32_to_cpu(im->seq);
	unsigned mseq = le32_to_cpu(im->migrate_seq);
	u64 realmino = le64_to_cpu(im->realm);
	u64 cap_id = le64_to_cpu(im->cap_id);

	if (ci->i_cap_exporting_mds >= 0 &&
	    ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
		dout("handle_cap_import inode %p ci %p mds%d mseq %d"
		     " - cleared exporting from mds%d\n",
		     inode, ci, mds, mseq,
		     ci->i_cap_exporting_mds);
		ci->i_cap_exporting_issued = 0;
		ci->i_cap_exporting_mseq = 0;
		ci->i_cap_exporting_mds = -1;
	} else {
		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
		     inode, ci, mds, mseq);
	}
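
	/*
	 * Update the snap realm hierarchy from the snap trace with
	 * snap_rwsem held for write, then downgrade: ceph_add_cap()
	 * and try_flush_caps() only need it for read.
	 */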
	down_write(&mdsc->snap_rwsem);
	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
			       false);
	downgrade_write(&mdsc->snap_rwsem);
	ceph_add_cap(inode, session, cap_id, -1,
		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
		     NULL /* no caps context */);
	try_flush_caps(inode, session, NULL);
	up_read(&mdsc->snap_rwsem);
}

/*
 * Handle a caps message from the MDS.
 *
 * Identify the appropriate session, inode, and call the right handler
 * based on the cap op.
 */
void ceph_handle_caps(struct ceph_mds_session *session,
		      struct ceph_msg *msg)
{
	struct ceph_mds_client *mdsc = session->s_mdsc;
	struct super_block *sb = mdsc->client->sb;
	struct inode *inode;
	struct ceph_cap *cap;
	struct ceph_mds_caps *h;
	int mds = session->s_mds;
	int op;
	u32 seq;
	struct ceph_vino vino;
	u64 cap_id;
	u64 size, max_size;
	u64 tid;
	void *snaptrace;

	dout("handle_caps from mds%d\n", mds);
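
	/*
	 * decode: the message front carries a struct ceph_mds_caps
	 * followed immediately by the snap trace blob; any xattr
	 * payload travels separately in msg->middle.
	 */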
	tid = le64_to_cpu(msg->hdr.tid);
	if (msg->front.iov_len < sizeof(*h))
		goto bad;
	h = msg->front.iov_base;
	snaptrace = h + 1;
	op = le32_to_cpu(h->op);
	vino.ino = le64_to_cpu(h->ino);
	vino.snap = CEPH_NOSNAP;
	cap_id = le64_to_cpu(h->cap_id);
	seq = le32_to_cpu(h->seq);
	size = le64_to_cpu(h->size);
	max_size = le64_to_cpu(h->max_size);

	mutex_lock(&session->s_mutex);
	session->s_seq++;
	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
	     (unsigned)seq);

	/* lookup ino */
	inode = ceph_find_inode(sb, vino);
	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
	     vino.snap, inode);
	if (!inode) {
		dout(" i don't have ino %llx\n", vino.ino);
		goto done;
	}

	/* these will work even if we don't have a cap yet */
	switch (op) {
	case CEPH_CAP_OP_FLUSHSNAP_ACK:
		handle_cap_flushsnap_ack(inode, tid, h, session);
		goto done;

	case CEPH_CAP_OP_EXPORT:
		handle_cap_export(inode, h, session);
		goto done;

	case CEPH_CAP_OP_IMPORT:
		handle_cap_import(mdsc, inode, h, session,
				  snaptrace, le32_to_cpu(h->snap_trace_len));
		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
				session);
		goto done_unlocked;
	}

	/* the rest require a cap */
	spin_lock(&inode->i_lock);
	cap = __get_cap_for_mds(ceph_inode(inode), mds);
	if (!cap) {
		dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
		     inode, ceph_ino(inode), ceph_snap(inode), mds);
		spin_unlock(&inode->i_lock);
		goto done;
	}

	/* note that each of these drops i_lock for us */
	switch (op) {
	case CEPH_CAP_OP_REVOKE:
	case CEPH_CAP_OP_GRANT:
		handle_cap_grant(inode, h, session, cap, msg->middle);
		goto done_unlocked;

	case CEPH_CAP_OP_FLUSH_ACK:
		handle_cap_flush_ack(inode, tid, h, session, cap);
		break;

	case CEPH_CAP_OP_TRUNC:
		handle_cap_trunc(inode, h, session);
		break;

	default:
		spin_unlock(&inode->i_lock);
		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
		       ceph_cap_op_name(op));
	}

done:
	mutex_unlock(&session->s_mutex);
done_unlocked:
	if (inode)
		iput(inode);
	return;

bad:
	pr_err("ceph_handle_caps: corrupt message\n");
	return;
}

/*
 * Delayed work handler to process end of delayed cap release LRU list.
 */
void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
	struct ceph_inode_info *ci;
	int flags = CHECK_CAPS_NODELAY;

	dout("check_delayed_caps\n");
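
	/*
	 * cap_delay_list is kept in i_hold_caps_max order, so we can
	 * stop at the first entry whose delay has not yet expired
	 * (unless it is explicitly flagged CEPH_I_FLUSH).
	 */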
	while (1) {
		spin_lock(&mdsc->cap_delay_lock);
		if (list_empty(&mdsc->cap_delay_list))
			break;
		ci = list_first_entry(&mdsc->cap_delay_list,
				      struct ceph_inode_info,
				      i_cap_delay_list);
		if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
		    time_before(jiffies, ci->i_hold_caps_max))
			break;
		list_del_init(&ci->i_cap_delay_list);
		spin_unlock(&mdsc->cap_delay_lock);
		dout("check_delayed_caps on %p\n", &ci->vfs_inode);
		ceph_check_caps(ci, flags, NULL);
	}
	spin_unlock(&mdsc->cap_delay_lock);
}

/*
 * Flush all dirty caps to the mds
 */
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
	struct ceph_inode_info *ci, *nci = NULL;
	struct inode *inode, *ninode = NULL;
	struct list_head *p, *n;

	dout("flush_dirty_caps\n");
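
	/*
	 * We can't hold cap_dirty_lock across ceph_check_caps(), so pin
	 * our place in the list before dropping it: igrab the *next*
	 * inode and flag it CEPH_I_NOFLUSH so nobody flushes it (and
	 * removes it from the list) while we aren't looking.
	 */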
	spin_lock(&mdsc->cap_dirty_lock);
	list_for_each_safe(p, n, &mdsc->cap_dirty) {
		if (nci) {
			ci = nci;
			inode = ninode;
			ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
			dout("flush_dirty_caps inode %p (was next inode)\n",
			     inode);
		} else {
			ci = list_entry(p, struct ceph_inode_info,
					i_dirty_item);
			inode = igrab(&ci->vfs_inode);
			BUG_ON(!inode);
			dout("flush_dirty_caps inode %p\n", inode);
		}
		if (n != &mdsc->cap_dirty) {
			nci = list_entry(n, struct ceph_inode_info,
					 i_dirty_item);
			ninode = igrab(&nci->vfs_inode);
			BUG_ON(!ninode);
			nci->i_ceph_flags |= CEPH_I_NOFLUSH;
			dout("flush_dirty_caps next inode %p, noflush\n",
			     ninode);
		} else {
			nci = NULL;
			ninode = NULL;
		}
		spin_unlock(&mdsc->cap_dirty_lock);
		if (inode) {
			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
					NULL);
			iput(inode);
		}
		spin_lock(&mdsc->cap_dirty_lock);
	}
	spin_unlock(&mdsc->cap_dirty_lock);
	dout("flush_dirty_caps done\n");
}

/*
 * Drop open file reference.  If we were the last open file,
 * we may need to release capabilities to the MDS (or schedule
 * their delayed release).
 */
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
{
	struct inode *inode = &ci->vfs_inode;
	int last = 0;

	spin_lock(&inode->i_lock);
	dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
	     ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
	BUG_ON(ci->i_nr_by_mode[fmode] == 0);
	if (--ci->i_nr_by_mode[fmode] == 0)
		last++;
	spin_unlock(&inode->i_lock);

	if (last && ci->i_vino.snap == CEPH_NOSNAP)
		ceph_check_caps(ci, 0, NULL);
}

/*
 * Helpers for embedding cap and dentry lease releases into mds
 * requests.
 *
 * @force is used by dentry_release (below) to force inclusion of a
 * record for the directory inode, even when there aren't any caps to
 * drop.
 */
int ceph_encode_inode_release(void **p, struct inode *inode,
			      int mds, int drop, int unless, int force)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap;
	struct ceph_mds_request_release *rel = *p;
	int ret = 0;
	int used = 0;

	spin_lock(&inode->i_lock);
	used = __ceph_caps_used(ci);

	dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
	     mds, ceph_cap_string(used), ceph_cap_string(drop),
	     ceph_cap_string(unless));
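
	/*
	 * @drop names caps we offer to release; @unless suppresses the
	 * release when any of its bits are still issued.  A release is
	 * only useful for caps the MDS would otherwise have to revoke.
	 */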
	/* only drop unused caps */
	drop &= ~used;

	cap = __get_cap_for_mds(ci, mds);
	if (cap && __cap_is_valid(cap)) {
		if (force ||
		    ((cap->issued & drop) &&
		     (cap->issued & unless) == 0)) {
			if ((cap->issued & drop) &&
			    (cap->issued & unless) == 0) {
				dout("encode_inode_release %p cap %p %s -> "
				     "%s\n", inode, cap,
				     ceph_cap_string(cap->issued),
				     ceph_cap_string(cap->issued & ~drop));
				cap->issued &= ~drop;
				cap->implemented &= ~drop;
				if (ci->i_ceph_flags & CEPH_I_NODELAY) {
					int wanted = __ceph_caps_wanted(ci);
					dout(" wanted %s -> %s (act %s)\n",
					     ceph_cap_string(cap->mds_wanted),
					     ceph_cap_string(cap->mds_wanted &
							     ~wanted),
					     ceph_cap_string(wanted));
					cap->mds_wanted &= wanted;
				}
			} else {
				dout("encode_inode_release %p cap %p %s"
				     " (force)\n", inode, cap,
				     ceph_cap_string(cap->issued));
			}
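
			/*
			 * Append the release record to the request buffer
			 * and advance the caller's *p past it.
			 */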
			rel->ino = cpu_to_le64(ceph_ino(inode));
			rel->cap_id = cpu_to_le64(cap->cap_id);
			rel->seq = cpu_to_le32(cap->seq);
			rel->issue_seq = cpu_to_le32(cap->issue_seq);
			rel->mseq = cpu_to_le32(cap->mseq);
			rel->caps = cpu_to_le32(cap->issued);
			rel->wanted = cpu_to_le32(cap->mds_wanted);
			rel->dname_len = 0;
			rel->dname_seq = 0;
			*p += sizeof(*rel);
			ret = 1;
		} else {
			dout("encode_inode_release %p cap %p %s\n",
			     inode, cap, ceph_cap_string(cap->issued));
		}
	}
	spin_unlock(&inode->i_lock);
	return ret;
}

int ceph_encode_dentry_release(void **p, struct dentry *dentry,
			       int mds, int drop, int unless)
{
	struct inode *dir = dentry->d_parent->d_inode;
	struct ceph_mds_request_release *rel = *p;
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int force = 0;
	int ret;

	/*
	 * force a record for the directory caps if we have a dentry lease.
	 * this is racy (can't take i_lock and d_lock together), but it
	 * doesn't have to be perfect; the mds will revoke anything we don't
	 * release.
	 */
	spin_lock(&dentry->d_lock);
	if (di->lease_session && di->lease_session->s_mds == mds)
		force = 1;
	spin_unlock(&dentry->d_lock);

	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);

	spin_lock(&dentry->d_lock);
	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
		dout("encode_dentry_release %p mds%d seq %d\n",
		     dentry, mds, (int)di->lease_seq);
		rel->dname_len = cpu_to_le32(dentry->d_name.len);
		memcpy(*p, dentry->d_name.name, dentry->d_name.len);
		*p += dentry->d_name.len;
		rel->dname_seq = cpu_to_le32(di->lease_seq);
	}
	spin_unlock(&dentry->d_lock);
	return ret;
}