fs/xfs/xfs_log_cil.c

   1 /*
   2  * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public License as
   6  * published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it would be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11  * GNU General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public License
  14  * along with this program; if not, write the Free Software Foundation,
  15  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  16  */
  17
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_types.h"
  21 #include "xfs_bit.h"
  22 #include "xfs_log.h"
  23 #include "xfs_inum.h"
  24 #include "xfs_trans.h"
  25 #include "xfs_trans_priv.h"
  26 #include "xfs_log_priv.h"
  27 #include "xfs_sb.h"
  28 #include "xfs_ag.h"
  29 #include "xfs_mount.h"
  30 #include "xfs_error.h"
  31 #include "xfs_alloc.h"
  32 #include "xfs_discard.h"
  33
  34 /*
  35  * Allocate a new ticket. Failing to get a new ticket makes it really hard to
  36  * recover, so we don't allow failure here. Also, we allocate in a context that
  37  * we don't want to be issuing transactions from, so we need to tell the
  38  * allocation code this as well.
  39  *
  40  * We don't reserve any space for the ticket - we are going to steal whatever
  41  * space we require from transactions as they commit. To ensure we reserve all
  42  * the space required, we need to set the current reservation of the ticket to
  43  * zero so that we know to steal the initial transaction overhead from the
  44  * first transaction commit.
  45  */
  46 static struct xlog_ticket *
  47 xlog_cil_ticket_alloc(
  48         struct log      *log)
  49 {
  50         struct xlog_ticket *tic;
  51
  52         tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
  53                                 KM_SLEEP|KM_NOFS);
  54         tic->t_trans_type = XFS_TRANS_CHECKPOINT;
  55
  56         /*
  57          * set the current reservation to zero so we know to steal the basic
  58          * transaction overhead reservation from the first transaction commit.
  59          */
  60         tic->t_curr_res = 0;
  61         return tic;
  62 }
  63
  64 /*
  65  * After the first stage of log recovery is done, we know where the head and
  66  * tail of the log are. We need this log initialisation done before we can
  67  * initialise the first CIL checkpoint context.
  68  *
  69  * Here we allocate a log ticket to track space usage during a CIL push.  This
  70  * ticket is passed to xlog_write() directly so that we don't slowly leak log
  71  * space by failing to account for space used by log headers and additional
  72  * region headers for split regions.
  73  */
  74 void
  75 xlog_cil_init_post_recovery(
  76         struct log      *log)
  77 {
  78         log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
  79         log->l_cilp->xc_ctx->sequence = 1;
  80         log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
  81                                                                 log->l_curr_block);
  82 }
  83
  84 /*
  85  * Format log item into a flat buffers
  86  *
  87  * For delayed logging, we need to hold a formatted buffer containing all the
  88  * changes on the log item. This enables us to relog the item in memory and
  89  * write it out asynchronously without needing to relock the object that was
  90  * modified at the time it gets written into the iclog.
  91  *
  92  * This function builds a vector for the changes in each log item in the
  93  * transaction. It then works out the length of the buffer needed for each log
  94  * item, allocates them and formats the vector for the item into the buffer.
  95  * The buffer is then attached to the log item are then inserted into the
  96  * Committed Item List for tracking until the next checkpoint is written out.
  97  *
  98  * We don't set up region headers during this process; we simply copy the
  99  * regions into the flat buffer. We can do this because we still have to do a
 100  * formatting step to write the regions into the iclog buffer.  Writing the
 101  * ophdrs during the iclog write means that we can support splitting large
 102  * regions across iclog boundares without needing a change in the format of the
 103  * item/region encapsulation.
 104  *
 105  * Hence what we need to do now is change the rewrite the vector array to point
 106  * to the copied region inside the buffer we just allocated. This allows us to
 107  * format the regions into the iclog as though they are being formatted
 108  * directly out of the objects themselves.
 109  */
 110 static struct xfs_log_vec *
 111 xlog_cil_prepare_log_vecs(
 112         struct xfs_trans        *tp)
 113 {
 114         struct xfs_log_item_desc *lidp;
 115         struct xfs_log_vec      *lv = NULL;
 116         struct xfs_log_vec      *ret_lv = NULL;
 117
 118
 119         /* Bail out if we didn't find a log item.  */
 120         if (list_empty(&tp->t_items)) {
 121                 ASSERT(0);
 122                 return NULL;
 123         }
 124
 125         list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 126                 struct xfs_log_vec *new_lv;
 127                 void    *ptr;
 128                 int     index;
 129                 int     len = 0;
 130                 uint    niovecs;
 131
 132                 /* Skip items which aren't dirty in this transaction. */
 133                 if (!(lidp->lid_flags & XFS_LID_DIRTY))
 134                         continue;
 135
 136                 /* Skip items that do not have any vectors for writing */
 137                 niovecs = IOP_SIZE(lidp->lid_item);
 138                 if (!niovecs)
 139                         continue;
 140
 141                 new_lv = kmem_zalloc(sizeof(*new_lv) +
 142                                 niovecs * sizeof(struct xfs_log_iovec),
 143                                 KM_SLEEP);
 144
 145                 /* The allocated iovec region lies beyond the log vector. */
 146                 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
 147                 new_lv->lv_niovecs = niovecs;
 148                 new_lv->lv_item = lidp->lid_item;
 149
 150                 /* build the vector array and calculate it's length */
 151                 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
 152                 for (index = 0; index < new_lv->lv_niovecs; index++)
 153                         len += new_lv->lv_iovecp[index].i_len;
 154
 155                 new_lv->lv_buf_len = len;
 156                 new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len,
 157                                 KM_SLEEP|KM_NOFS);
 158                 ptr = new_lv->lv_buf;
 159
 160                 for (index = 0; index < new_lv->lv_niovecs; index++) {
 161                         struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index];
 162
 163                         memcpy(ptr, vec->i_addr, vec->i_len);
 164                         vec->i_addr = ptr;
 165                         ptr += vec->i_len;
 166                 }
 167                 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
 168
 169                 if (!ret_lv)
 170                         ret_lv = new_lv;
 171                 else
 172                         lv->lv_next = new_lv;
 173                 lv = new_lv;
 174         }
 175
 176         return ret_lv;
 177 }
 178
 179 /*
 180  * Prepare the log item for insertion into the CIL. Calculate the difference in
 181  * log space and vectors it will consume, and if it is a new item pin it as
 182  * well.
 183  */
 184 STATIC void
 185 xfs_cil_prepare_item(
 186         struct log              *log,
 187         struct xfs_log_vec      *lv,
 188         int                     *len,
 189         int                     *diff_iovecs)
 190 {
 191         struct xfs_log_vec      *old = lv->lv_item->li_lv;
 192
 193         if (old) {
 194                 /* existing lv on log item, space used is a delta */
 195                 ASSERT(!list_empty(&lv->lv_item->li_cil));
 196                 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
 197
 198                 *len += lv->lv_buf_len - old->lv_buf_len;
 199                 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
 200                 kmem_free(old->lv_buf);
 201                 kmem_free(old);
 202         } else {
 203                 /* new lv, must pin the log item */
 204                 ASSERT(!lv->lv_item->li_lv);
 205                 ASSERT(list_empty(&lv->lv_item->li_cil));
 206
 207                 *len += lv->lv_buf_len;
 208                 *diff_iovecs += lv->lv_niovecs;
 209                 IOP_PIN(lv->lv_item);
 210
 211         }
 212
 213         /* attach new log vector to log item */
 214         lv->lv_item->li_lv = lv;
 215
 216         /*
 217          * If this is the first time the item is being committed to the
 218          * CIL, store the sequence number on the log item so we can
 219          * tell in future commits whether this is the first checkpoint
 220          * the item is being committed into.
 221          */
 222         if (!lv->lv_item->li_seq)
 223                 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
 224 }
 225
 226 /*
 227  * Insert the log items into the CIL and calculate the difference in space
 228  * consumed by the item. Add the space to the checkpoint ticket and calculate
 229  * if the change requires additional log metadata. If it does, take that space
 230  * as well. Remove the amount of space we added to the checkpoint ticket from
 231  * the current transaction ticket so that the accounting works out correctly.
 232  */
 233 static void
 234 xlog_cil_insert_items(
 235         struct log              *log,
 236         struct xfs_log_vec      *log_vector,
 237         struct xlog_ticket      *ticket)
 238 {
 239         struct xfs_cil          *cil = log->l_cilp;
 240         struct xfs_cil_ctx      *ctx = cil->xc_ctx;
 241         struct xfs_log_vec      *lv;
 242         int                     len = 0;
 243         int                     diff_iovecs = 0;
 244         int                     iclog_space;
 245
 246         ASSERT(log_vector);
 247
 248         /*
 249          * Do all the accounting aggregation and switching of log vectors
 250          * around in a separate loop to the insertion of items into the CIL.
 251          * Then we can do a separate loop to update the CIL within a single
 252          * lock/unlock pair. This reduces the number of round trips on the CIL
 253          * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
 254          * hold time for the transaction commit.
 255          *
 256          * If this is the first time the item is being placed into the CIL in
 257          * this context, pin it so it can't be written to disk until the CIL is
 258          * flushed to the iclog and the iclog written to disk.
 259          *
 260          * We can do this safely because the context can't checkpoint until we
 261          * are done so it doesn't matter exactly how we update the CIL.
 262          */
 263         for (lv = log_vector; lv; lv = lv->lv_next)
 264                 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
 265
 266         /* account for space used by new iovec headers  */
 267         len += diff_iovecs * sizeof(xlog_op_header_t);
 268
 269         spin_lock(&cil->xc_cil_lock);
 270
 271         /* move the items to the tail of the CIL */
 272         for (lv = log_vector; lv; lv = lv->lv_next)
 273                 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
 274
 275         ctx->nvecs += diff_iovecs;
 276
 277         /*
 278          * Now transfer enough transaction reservation to the context ticket
 279          * for the checkpoint. The context ticket is special - the unit
 280          * reservation has to grow as well as the current reservation as we
 281          * steal from tickets so we can correctly determine the space used
 282          * during the transaction commit.
 283          */
 284         if (ctx->ticket->t_curr_res == 0) {
 285                 /* first commit in checkpoint, steal the header reservation */
 286                 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
 287                 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
 288                 ticket->t_curr_res -= ctx->ticket->t_unit_res;
 289         }
 290
 291         /* do we need space for more log record headers? */
 292         iclog_space = log->l_iclog_size - log->l_iclog_hsize;
 293         if (len > 0 && (ctx->space_used / iclog_space !=
 294                                 (ctx->space_used + len) / iclog_space)) {
 295                 int hdrs;
 296
 297                 hdrs = (len + iclog_space - 1) / iclog_space;
 298                 /* need to take into account split region headers, too */
 299                 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
 300                 ctx->ticket->t_unit_res += hdrs;
 301                 ctx->ticket->t_curr_res += hdrs;
 302                 ticket->t_curr_res -= hdrs;
 303                 ASSERT(ticket->t_curr_res >= len);
 304         }
 305         ticket->t_curr_res -= len;
 306         ctx->space_used += len;
 307
 308         spin_unlock(&cil->xc_cil_lock);
 309 }
 310
 311 static void
 312 xlog_cil_free_logvec(
 313         struct xfs_log_vec      *log_vector)
 314 {
 315         struct xfs_log_vec      *lv;
 316
 317         for (lv = log_vector; lv; ) {
 318                 struct xfs_log_vec *next = lv->lv_next;
 319                 kmem_free(lv->lv_buf);
 320                 kmem_free(lv);
 321                 lv = next;
 322         }
 323 }
 324
 325 /*
 326  * Mark all items committed and clear busy extents. We free the log vector
 327  * chains in a separate pass so that we unpin the log items as quickly as
 328  * possible.
 329  */
 330 static void
 331 xlog_cil_committed(
 332         void    *args,
 333         int     abort)
 334 {
 335         struct xfs_cil_ctx      *ctx = args;
 336         struct xfs_mount        *mp = ctx->cil->xc_log->l_mp;
 337
 338         xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
 339                                         ctx->start_lsn, abort);
 340
 341         xfs_alloc_busy_sort(&ctx->busy_extents);
 342         xfs_alloc_busy_clear(mp, &ctx->busy_extents,
 343                              (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
 344
 345         spin_lock(&ctx->cil->xc_cil_lock);
 346         list_del(&ctx->committing);
 347         spin_unlock(&ctx->cil->xc_cil_lock);
 348
 349         xlog_cil_free_logvec(ctx->lv_chain);
 350
 351         if (!list_empty(&ctx->busy_extents)) {
 352                 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
 353
 354                 xfs_discard_extents(mp, &ctx->busy_extents);
 355                 xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
 356         }
 357
 358         kmem_free(ctx);
 359 }
 360
 361 /*
 362  * Push the Committed Item List to the log. If @push_seq flag is zero, then it
 363  * is a background flush and so we can chose to ignore it. Otherwise, if the
 364  * current sequence is the same as @push_seq we need to do a flush. If
 365  * @push_seq is less than the current sequence, then it has already been
 366  * flushed and we don't need to do anything - the caller will wait for it to
 367  * complete if necessary.
 368  *
 369  * @push_seq is a value rather than a flag because that allows us to do an
 370  * unlocked check of the sequence number for a match. Hence we can allows log
 371  * forces to run racily and not issue pushes for the same sequence twice. If we
 372  * get a race between multiple pushes for the same sequence they will block on
 373  * the first one and then abort, hence avoiding needless pushes.
 374  */
 375 STATIC int
 376 xlog_cil_push(
 377         struct log              *log)
 378 {
 379         struct xfs_cil          *cil = log->l_cilp;
 380         struct xfs_log_vec      *lv;
 381         struct xfs_cil_ctx      *ctx;
 382         struct xfs_cil_ctx      *new_ctx;
 383         struct xlog_in_core     *commit_iclog;
 384         struct xlog_ticket      *tic;
 385         int                     num_lv;
 386         int                     num_iovecs;
 387         int                     len;
 388         int                     error = 0;
 389         struct xfs_trans_header thdr;
 390         struct xfs_log_iovec    lhdr;
 391         struct xfs_log_vec      lvhdr = { NULL };
 392         xfs_lsn_t               commit_lsn;
 393         xfs_lsn_t               push_seq;
 394
 395         if (!cil)
 396                 return 0;
 397
 398         new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
 399         new_ctx->ticket = xlog_cil_ticket_alloc(log);
 400
 401         down_write(&cil->xc_ctx_lock);
 402         ctx = cil->xc_ctx;
 403
 404         spin_lock(&cil->xc_cil_lock);
 405         push_seq = cil->xc_push_seq;
 406         ASSERT(push_seq <= ctx->sequence);
 407
 408         /*
 409          * Check if we've anything to push. If there is nothing, then we don't
 410          * move on to a new sequence number and so we have to be able to push
 411          * this sequence again later.
 412          */
 413         if (list_empty(&cil->xc_cil)) {
 414                 cil->xc_push_seq = 0;
 415                 spin_unlock(&cil->xc_cil_lock);
 416                 goto out_skip;
 417         }
 418         spin_unlock(&cil->xc_cil_lock);
 419
 420
 421         /* check for a previously pushed seqeunce */
 422         if (push_seq < cil->xc_ctx->sequence)
 423                 goto out_skip;
 424
 425         /*
 426          * pull all the log vectors off the items in the CIL, and
 427          * remove the items from the CIL. We don't need the CIL lock
 428          * here because it's only needed on the transaction commit
 429          * side which is currently locked out by the flush lock.
 430          */
 431         lv = NULL;
 432         num_lv = 0;
 433         num_iovecs = 0;
 434         len = 0;
 435         while (!list_empty(&cil->xc_cil)) {
 436                 struct xfs_log_item     *item;
 437                 int                     i;
 438
 439                 item = list_first_entry(&cil->xc_cil,
 440                                         struct xfs_log_item, li_cil);
 441                 list_del_init(&item->li_cil);
 442                 if (!ctx->lv_chain)
 443                         ctx->lv_chain = item->li_lv;
 444                 else
 445                         lv->lv_next = item->li_lv;
 446                 lv = item->li_lv;
 447                 item->li_lv = NULL;
 448
 449                 num_lv++;
 450                 num_iovecs += lv->lv_niovecs;
 451                 for (i = 0; i < lv->lv_niovecs; i++)
 452                         len += lv->lv_iovecp[i].i_len;
 453         }
 454
 455         /*
 456          * initialise the new context and attach it to the CIL. Then attach
 457          * the current context to the CIL committing lsit so it can be found
 458          * during log forces to extract the commit lsn of the sequence that
 459          * needs to be forced.
 460          */
 461         INIT_LIST_HEAD(&new_ctx->committing);
 462         INIT_LIST_HEAD(&new_ctx->busy_extents);
 463         new_ctx->sequence = ctx->sequence + 1;
 464         new_ctx->cil = cil;
 465         cil->xc_ctx = new_ctx;
 466
 467         /*
 468          * mirror the new sequence into the cil structure so that we can do
 469          * unlocked checks against the current sequence in log forces without
 470          * risking deferencing a freed context pointer.
 471          */
 472         cil->xc_current_sequence = new_ctx->sequence;
 473
 474         /*
 475          * The switch is now done, so we can drop the context lock and move out
 476          * of a shared context. We can't just go straight to the commit record,
 477          * though - we need to synchronise with previous and future commits so
 478          * that the commit records are correctly ordered in the log to ensure
 479          * that we process items during log IO completion in the correct order.
 480          *
 481          * For example, if we get an EFI in one checkpoint and the EFD in the
 482          * next (e.g. due to log forces), we do not want the checkpoint with
 483          * the EFD to be committed before the checkpoint with the EFI.  Hence
 484          * we must strictly order the commit records of the checkpoints so
 485          * that: a) the checkpoint callbacks are attached to the iclogs in the
 486          * correct order; and b) the checkpoints are replayed in correct order
 487          * in log recovery.
 488          *
 489          * Hence we need to add this context to the committing context list so
 490          * that higher sequences will wait for us to write out a commit record
 491          * before they do.
 492          */
 493         spin_lock(&cil->xc_cil_lock);
 494         list_add(&ctx->committing, &cil->xc_committing);
 495         spin_unlock(&cil->xc_cil_lock);
 496         up_write(&cil->xc_ctx_lock);
 497
 498         /*
 499          * Build a checkpoint transaction header and write it to the log to
 500          * begin the transaction. We need to account for the space used by the
 501          * transaction header here as it is not accounted for in xlog_write().
 502          *
 503          * The LSN we need to pass to the log items on transaction commit is
 504          * the LSN reported by the first log vector write. If we use the commit
 505          * record lsn then we can move the tail beyond the grant write head.
 506          */
 507         tic = ctx->ticket;
 508         thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
 509         thdr.th_type = XFS_TRANS_CHECKPOINT;
 510         thdr.th_tid = tic->t_tid;
 511         thdr.th_num_items = num_iovecs;
 512         lhdr.i_addr = &thdr;
 513         lhdr.i_len = sizeof(xfs_trans_header_t);
 514         lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
 515         tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
 516
 517         lvhdr.lv_niovecs = 1;
 518         lvhdr.lv_iovecp = &lhdr;
 519         lvhdr.lv_next = ctx->lv_chain;
 520
 521         error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
 522         if (error)
 523                 goto out_abort_free_ticket;
 524
 525         /*
 526          * now that we've written the checkpoint into the log, strictly
 527          * order the commit records so replay will get them in the right order.
 528          */
 529 restart:
 530         spin_lock(&cil->xc_cil_lock);
 531         list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
 532                 /*
 533                  * Higher sequences will wait for this one so skip them.
 534                  * Don't wait for own own sequence, either.
 535                  */
 536                 if (new_ctx->sequence >= ctx->sequence)
 537                         continue;
 538                 if (!new_ctx->commit_lsn) {
 539                         /*
 540                          * It is still being pushed! Wait for the push to
 541                          * complete, then start again from the beginning.
 542                          */
 543                         xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
 544                         goto restart;
 545                 }
 546         }
 547         spin_unlock(&cil->xc_cil_lock);
 548
 549         /* xfs_log_done always frees the ticket on error. */
 550         commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
 551         if (commit_lsn == -1)
 552                 goto out_abort;
 553
 554         /* attach all the transactions w/ busy extents to iclog */
 555         ctx->log_cb.cb_func = xlog_cil_committed;
 556         ctx->log_cb.cb_arg = ctx;
 557         error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
 558         if (error)
 559                 goto out_abort;
 560
 561         /*
 562          * now the checkpoint commit is complete and we've attached the
 563          * callbacks to the iclog we can assign the commit LSN to the context
 564          * and wake up anyone who is waiting for the commit to complete.
 565          */
 566         spin_lock(&cil->xc_cil_lock);
 567         ctx->commit_lsn = commit_lsn;
 568         wake_up_all(&cil->xc_commit_wait);
 569         spin_unlock(&cil->xc_cil_lock);
 570
 571         /* release the hounds! */
 572         return xfs_log_release_iclog(log->l_mp, commit_iclog);
 573
 574 out_skip:
 575         up_write(&cil->xc_ctx_lock);
 576         xfs_log_ticket_put(new_ctx->ticket);
 577         kmem_free(new_ctx);
 578         return 0;
 579
 580 out_abort_free_ticket:
 581         xfs_log_ticket_put(tic);
 582 out_abort:
 583         xlog_cil_committed(ctx, XFS_LI_ABORTED);
 584         return XFS_ERROR(EIO);
 585 }
 586
 587 static void
 588 xlog_cil_push_work(
 589         struct work_struct      *work)
 590 {
 591         struct xfs_cil          *cil = container_of(work, struct xfs_cil,
 592                                                         xc_push_work);
 593         xlog_cil_push(cil->xc_log);
 594 }
 595
 596 /*
 597  * We need to push CIL every so often so we don't cache more than we can fit in
 598  * the log. The limit really is that a checkpoint can't be more than half the
 599  * log (the current checkpoint is not allowed to overwrite the previous
 600  * checkpoint), but commit latency and memory usage limit this to a smaller
 601  * size.
 602  */
 603 static void
 604 xlog_cil_push_background(
 605         struct log      *log)
 606 {
 607         struct xfs_cil  *cil = log->l_cilp;
 608
 609         /*
 610          * The cil won't be empty because we are called while holding the
 611          * context lock so whatever we added to the CIL will still be there
 612          */
 613         ASSERT(!list_empty(&cil->xc_cil));
 614
 615         /*
 616          * don't do a background push if we haven't used up all the
 617          * space available yet.
 618          */
 619         if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
 620                 return;
 621
 622         spin_lock(&cil->xc_cil_lock);
 623         if (cil->xc_push_seq < cil->xc_current_sequence) {
 624                 cil->xc_push_seq = cil->xc_current_sequence;
 625                 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
 626         }
 627         spin_unlock(&cil->xc_cil_lock);
 628
 629 }
 630
 631 static void
 632 xlog_cil_push_foreground(
 633         struct log      *log,
 634         xfs_lsn_t       push_seq)
 635 {
 636         struct xfs_cil  *cil = log->l_cilp;
 637
 638         if (!cil)
 639                 return;
 640
 641         ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
 642
 643         /* start on any pending background push to minimise wait time on it */
 644         flush_work(&cil->xc_push_work);
 645
 646         /*
 647          * If the CIL is empty or we've already pushed the sequence then
 648          * there's no work we need to do.
 649          */
 650         spin_lock(&cil->xc_cil_lock);
 651         if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
 652                 spin_unlock(&cil->xc_cil_lock);
 653                 return;
 654         }
 655
 656         cil->xc_push_seq = push_seq;
 657         spin_unlock(&cil->xc_cil_lock);
 658
 659         /* do the push now */
 660         xlog_cil_push(log);
 661 }
 662
 663 /*
 664  * Commit a transaction with the given vector to the Committed Item List.
 665  *
 666  * To do this, we need to format the item, pin it in memory if required and
 667  * account for the space used by the transaction. Once we have done that we
 668  * need to release the unused reservation for the transaction, attach the
 669  * transaction to the checkpoint context so we carry the busy extents through
 670  * to checkpoint completion, and then unlock all the items in the transaction.
 671  *
 672  * For more specific information about the order of operations in
 673  * xfs_log_commit_cil() please refer to the comments in
 674  * xfs_trans_commit_iclog().
 675  *
 676  * Called with the context lock already held in read mode to lock out
 677  * background commit, returns without it held once background commits are
 678  * allowed again.
 679  */
 680 int
 681 xfs_log_commit_cil(
 682         struct xfs_mount        *mp,
 683         struct xfs_trans        *tp,
 684         xfs_lsn_t               *commit_lsn,
 685         int                     flags)
 686 {
 687         struct log              *log = mp->m_log;
 688         int                     log_flags = 0;
 689         struct xfs_log_vec      *log_vector;
 690
 691         if (flags & XFS_TRANS_RELEASE_LOG_RES)
 692                 log_flags = XFS_LOG_REL_PERM_RESERV;
 693
 694         /*
 695          * Do all the hard work of formatting items (including memory
 696          * allocation) outside the CIL context lock. This prevents stalling CIL
 697          * pushes when we are low on memory and a transaction commit spends a
 698          * lot of time in memory reclaim.
 699          */
 700         log_vector = xlog_cil_prepare_log_vecs(tp);
 701         if (!log_vector)
 702                 return ENOMEM;
 703
 704         /* lock out background commit */
 705         down_read(&log->l_cilp->xc_ctx_lock);
 706         if (commit_lsn)
 707                 *commit_lsn = log->l_cilp->xc_ctx->sequence;
 708
 709         xlog_cil_insert_items(log, log_vector, tp->t_ticket);
 710
 711         /* check we didn't blow the reservation */
 712         if (tp->t_ticket->t_curr_res < 0)
 713                 xlog_print_tic_res(log->l_mp, tp->t_ticket);
 714
 715         /* attach the transaction to the CIL if it has any busy extents */
 716         if (!list_empty(&tp->t_busy)) {
 717                 spin_lock(&log->l_cilp->xc_cil_lock);
 718                 list_splice_init(&tp->t_busy,
 719                                         &log->l_cilp->xc_ctx->busy_extents);
 720                 spin_unlock(&log->l_cilp->xc_cil_lock);
 721         }
 722
 723         tp->t_commit_lsn = *commit_lsn;
 724         xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
 725         xfs_trans_unreserve_and_mod_sb(tp);
 726
 727         /*
 728          * Once all the items of the transaction have been copied to the CIL,
 729          * the items can be unlocked and freed.
 730          *
 731          * This needs to be done before we drop the CIL context lock because we
 732          * have to update state in the log items and unlock them before they go
 733          * to disk. If we don't, then the CIL checkpoint can race with us and
 734          * we can run checkpoint completion before we've updated and unlocked
 735          * the log items. This affects (at least) processing of stale buffers,
 736          * inodes and EFIs.
 737          */
 738         xfs_trans_free_items(tp, *commit_lsn, 0);
 739
 740         xlog_cil_push_background(log);
 741
 742         up_read(&log->l_cilp->xc_ctx_lock);
 743         return 0;
 744 }
 745
 746 /*
 747  * Conditionally push the CIL based on the sequence passed in.
 748  *
 749  * We only need to push if we haven't already pushed the sequence
 750  * number given. Hence the only time we will trigger a push here is
 751  * if the push sequence is the same as the current context.
 752  *
 753  * We return the current commit lsn to allow the callers to determine if a
 754  * iclog flush is necessary following this call.
 755  */
 756 xfs_lsn_t
 757 xlog_cil_force_lsn(
 758         struct log      *log,
 759         xfs_lsn_t       sequence)
 760 {
 761         struct xfs_cil          *cil = log->l_cilp;
 762         struct xfs_cil_ctx      *ctx;
 763         xfs_lsn_t               commit_lsn = NULLCOMMITLSN;
 764
 765         ASSERT(sequence <= cil->xc_current_sequence);
 766
 767         /*
 768          * check to see if we need to force out the current context.
 769          * xlog_cil_push() handles racing pushes for the same sequence,
 770          * so no need to deal with it here.
 771          */
 772         xlog_cil_push_foreground(log, sequence);
 773
 774         /*
 775          * See if we can find a previous sequence still committing.
 776          * We need to wait for all previous sequence commits to complete
 777          * before allowing the force of push_seq to go ahead. Hence block
 778          * on commits for those as well.
 779          */
 780 restart:
 781         spin_lock(&cil->xc_cil_lock);
 782         list_for_each_entry(ctx, &cil->xc_committing, committing) {
 783                 if (ctx->sequence > sequence)
 784                         continue;
 785                 if (!ctx->commit_lsn) {
 786                         /*
 787                          * It is still being pushed! Wait for the push to
 788                          * complete, then start again from the beginning.
 789                          */
 790                         xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
 791                         goto restart;
 792                 }
 793                 if (ctx->sequence != sequence)
 794                         continue;
 795                 /* found it! */
 796                 commit_lsn = ctx->commit_lsn;
 797         }
 798         spin_unlock(&cil->xc_cil_lock);
 799         return commit_lsn;
 800 }
 801
 802 /*
 803  * Check if the current log item was first committed in this sequence.
 804  * We can't rely on just the log item being in the CIL, we have to check
 805  * the recorded commit sequence number.
 806  *
 807  * Note: for this to be used in a non-racy manner, it has to be called with
 808  * CIL flushing locked out. As a result, it should only be used during the
 809  * transaction commit process when deciding what to format into the item.
 810  */
 811 bool
 812 xfs_log_item_in_current_chkpt(
 813         struct xfs_log_item *lip)
 814 {
 815         struct xfs_cil_ctx *ctx;
 816
 817         if (list_empty(&lip->li_cil))
 818                 return false;
 819
 820         ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
 821
 822         /*
 823          * li_seq is written on the first commit of a log item to record the
 824          * first checkpoint it is written to. Hence if it is different to the
 825          * current sequence, we're in a new checkpoint.
 826          */
 827         if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
 828                 return false;
 829         return true;
 830 }
 831
 832 /*
 833  * Perform initial CIL structure initialisation.
 834  */
 835 int
 836 xlog_cil_init(
 837         struct log      *log)
 838 {
 839         struct xfs_cil  *cil;
 840         struct xfs_cil_ctx *ctx;
 841
 842         cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
 843         if (!cil)
 844                 return ENOMEM;
 845
 846         ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
 847         if (!ctx) {
 848                 kmem_free(cil);
 849                 return ENOMEM;
 850         }
 851
 852         INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
 853         INIT_LIST_HEAD(&cil->xc_cil);
 854         INIT_LIST_HEAD(&cil->xc_committing);
 855         spin_lock_init(&cil->xc_cil_lock);
 856         init_rwsem(&cil->xc_ctx_lock);
 857         init_waitqueue_head(&cil->xc_commit_wait);
 858
 859         INIT_LIST_HEAD(&ctx->committing);
 860         INIT_LIST_HEAD(&ctx->busy_extents);
 861         ctx->sequence = 1;
 862         ctx->cil = cil;
 863         cil->xc_ctx = ctx;
 864         cil->xc_current_sequence = ctx->sequence;
 865
 866         cil->xc_log = log;
 867         log->l_cilp = cil;
 868         return 0;
 869 }
 870
 871 void
 872 xlog_cil_destroy(
 873         struct log      *log)
 874 {
 875         if (log->l_cilp->xc_ctx) {
 876                 if (log->l_cilp->xc_ctx->ticket)
 877                         xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
 878                 kmem_free(log->l_cilp->xc_ctx);
 879         }
 880
 881         ASSERT(list_empty(&log->l_cilp->xc_cil));
 882         kmem_free(log->l_cilp);
 883 }
 884