/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *      (sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *      (jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
{
        return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
                                                   new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned long offset);

/*
 * Test whether an inode is a fast symlink.
 */
static int ext4_inode_is_fast_symlink(struct inode *inode)
{
        int ea_blocks = EXT4_I(inode)->i_file_acl ?
                (inode->i_sb->s_blocksize >> 9) : 0;

        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
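
/*
 * Worked example of the test above (illustrative, assuming a 4KB block
 * size): i_blocks is counted in 512-byte sectors, so an xattr block
 * accounts for 4096 >> 9 == 8 sectors and ea_blocks == 8.  A fast
 * symlink stores its target directly in i_data and allocates no data
 * blocks, so after subtracting the (optional) xattr block its i_blocks
 * comes out to exactly 0, which is what this test detects.
 */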

/*
 * The ext4 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (e.g. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 */
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
                        struct buffer_head *bh, ext4_fsblk_t blocknr)
{
        int err;

        might_sleep();

        BUFFER_TRACE(bh, "enter");

        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
                  "data mode %lx\n",
                  bh, is_metadata, inode->i_mode,
                  test_opt(inode->i_sb, DATA_FLAGS));

        /* Never use the revoke function if we are doing full data
         * journaling: there is no need to, and a V1 superblock won't
         * support it.  Otherwise, only skip the revoke on un-journaled
         * data blocks. */

        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            (!is_metadata && !ext4_should_journal_data(inode))) {
                if (bh) {
                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
                        return ext4_journal_forget(handle, bh);
                }
                return 0;
        }

        /*
         * data!=journal && (is_metadata || should_journal_data(inode))
         */
        BUFFER_TRACE(bh, "call ext4_journal_revoke");
        err = ext4_journal_revoke(handle, blocknr, bh);
        if (err)
                ext4_abort(inode->i_sb, __func__,
                           "error %d when attempting revoke", err);
        BUFFER_TRACE(bh, "exit");
        return err;
}

/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
static unsigned long blocks_for_truncate(struct inode *inode)
{
        ext4_lblk_t needed;

        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

        /* Give ourselves just enough room to cope with inodes in which
         * i_blocks is corrupt: we've seen disk corruptions in the past
         * which resulted in random data in an inode which looked enough
         * like a regular file for ext4 to try to delete it.  Things
         * will go a bit crazy if that happens, but at least we should
         * try not to panic the whole kernel. */
        if (needed < 2)
                needed = 2;

        /* But we need to bound the transaction so we don't overflow the
         * journal. */
        if (needed > EXT4_MAX_TRANS_DATA)
                needed = EXT4_MAX_TRANS_DATA;

        return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
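
/*
 * Example of the arithmetic above (a sketch, assuming a 4KB block size):
 * s_blocksize_bits is 12, so the shift is 12 - 9 = 3, converting i_blocks
 * (counted in 512-byte sectors) into filesystem blocks.  An inode with
 * i_blocks == 160 gives needed == 20; assuming that is within
 * EXT4_MAX_TRANS_DATA, the estimate is EXT4_DATA_TRANS_BLOCKS(sb) + 20
 * journal credits for the next chunk of the truncate.
 */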

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
        handle_t *result;

        result = ext4_journal_start(inode, blocks_for_truncate(inode));
        if (!IS_ERR(result))
                return result;

        ext4_std_error(inode->i_sb, PTR_ERR(result));
        return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
        if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
                return 0;
        if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
                return 0;
        return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
{
        jbd_debug(2, "restarting handle %p\n", handle);
        return ext4_journal_restart(handle, blocks_for_truncate(inode));
}
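
/*
 * A sketch of how the two helpers above are meant to be used together by
 * the truncate path (illustrative, not a verbatim excerpt from the
 * callers):
 *
 *        if (try_to_extend_transaction(handle, inode)) {
 *                ... dirty everything consistently against the handle ...
 *                ext4_journal_test_restart(handle, inode);
 *        }
 *
 * i.e. keep working while credits remain, and commit-and-restart at a
 * consistent checkpoint once ext4_journal_extend() can no longer help.
 */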

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_delete_inode(struct inode *inode)
{
        handle_t *handle;

        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages(&inode->i_data, 0);

        if (is_bad_inode(inode))
                goto no_delete;

        handle = start_transaction(inode);
        if (IS_ERR(handle)) {
                /*
                 * If we're going to skip the normal cleanup, we still need to
                 * make sure that the in-core orphan linked list is properly
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                handle->h_sync = 1;
        inode->i_size = 0;
        if (inode->i_blocks)
                ext4_truncate(inode);
        /*
         * Kill off the orphan record which ext4_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext4_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext4_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext4_orphan_del(handle, inode);
        EXT4_I(inode)->i_dtime  = get_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        return;
no_delete:
        clear_inode(inode);     /* We must guarantee clearing of inode... */
}

typedef struct {
        __le32  *p;
        __le32  key;
        struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
        p->key = *(p->p = v);
        p->bh = bh;
}

/**
 *      ext4_block_to_path - parse the block number into array of offsets
 *      @inode: inode in question (we are only interested in its superblock)
 *      @i_block: block number to be parsed
 *      @offsets: array to store the offsets in
 *      @boundary: set this non-zero if the referred-to block is likely to be
 *             followed (on disk) by an indirect block.
 *
 *      To store the locations of a file's data, ext4 uses a data structure
 *      common to UNIX filesystems - a tree of pointers anchored in the
 *      inode, with data blocks at the leaves and indirect blocks in the
 *      intermediate nodes.  This function translates the block number into
 *      a path in that tree - the return value is the path length and
 *      @offsets[n] is the offset of the pointer to the (n+1)th node in the
 *      nth one.  If @i_block is out of range (negative or too large) a
 *      warning is printed and zero is returned.
 *
 *      Note: function doesn't find node addresses, so no IO is needed.  All
 *      we need to know is the capacity of indirect blocks (taken from the
 *      inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext4_block_to_path(struct inode *inode,
                        ext4_lblk_t i_block,
                        ext4_lblk_t offsets[4], int *boundary)
{
        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
        const long direct_blocks = EXT4_NDIR_BLOCKS,
                indirect_blocks = ptrs,
                double_blocks = (1 << (ptrs_bits * 2));
        int n = 0;
        int final = 0;

        if (i_block < 0) {
                ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
        } else if (i_block < direct_blocks) {
                offsets[n++] = i_block;
                final = direct_blocks;
        } else if ((i_block -= direct_blocks) < indirect_blocks) {
                offsets[n++] = EXT4_IND_BLOCK;
                offsets[n++] = i_block;
                final = ptrs;
        } else if ((i_block -= indirect_blocks) < double_blocks) {
                offsets[n++] = EXT4_DIND_BLOCK;
                offsets[n++] = i_block >> ptrs_bits;
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
                offsets[n++] = EXT4_TIND_BLOCK;
                offsets[n++] = i_block >> (ptrs_bits * 2);
                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
                ext4_warning(inode->i_sb, "ext4_block_to_path",
                                "block %lu > max",
                                i_block + direct_blocks +
                                indirect_blocks + double_blocks);
        }
        if (boundary)
                *boundary = final - 1 - (i_block & (ptrs - 1));
        return n;
}
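
/*
 * Worked example (illustrative, assuming a 4KB block size, so
 * EXT4_ADDR_PER_BLOCK == 1024 and EXT4_NDIR_BLOCKS == 12):
 *
 *        i_block = 5    -> depth 1, offsets = { 5 }
 *        i_block = 12   -> depth 2, offsets = { EXT4_IND_BLOCK, 0 }
 *        i_block = 1036 -> depth 3, offsets = { EXT4_DIND_BLOCK, 0, 0 }
 *
 * For i_block == 1036: 1036 - 12 = 1024 is not < 1024 (indirect range),
 * and 1024 - 1024 = 0 falls in the doubly-indirect range.
 */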

/**
 *      ext4_get_branch - read the chain of indirect blocks leading to data
 *      @inode: inode in question
 *      @depth: depth of the chain (1 - direct pointer, etc.)
 *      @offsets: offsets of pointers in inode/indirect blocks
 *      @chain: place to store the result
 *      @err: here we store the error value
 *
 *      Function fills the array of triples <key, p, bh> and returns %NULL
 *      if everything went OK or the pointer to the last filled triple
 *      (incomplete one) otherwise. Upon the return chain[i].key contains
 *      the number of (i+1)-th block in the chain (as it is stored in memory,
 *      i.e. little-endian 32-bit), chain[i].p contains the address of that
 *      number (it points into struct inode for i==0 and into the bh->b_data
 *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *      block for i>0 and NULL for i==0. In other words, it holds the block
 *      numbers of the chain, addresses they were taken from (and where we can
 *      verify that chain did not change) and buffer_heads hosting these
 *      numbers.
 *
 *      Function stops when it stumbles upon zero pointer (absent block)
 *              (pointer to last triple returned, *@err == 0)
 *      or when it gets an IO error reading an indirect block
 *              (ditto, *@err == -EIO)
 *      or when it reads all @depth-1 indirect blocks successfully and finds
 *      the whole chain, all the way to the data (returns %NULL, *err == 0).
 *
 *      Needs to be called with
 *      down_read(&EXT4_I(inode)->i_data_sem)
 */
static Indirect *ext4_get_branch(struct inode *inode, int depth,
                                 ext4_lblk_t *offsets,
                                 Indirect chain[4], int *err)
{
        struct super_block *sb = inode->i_sb;
        Indirect *p = chain;
        struct buffer_head *bh;

        *err = 0;
        /* i_data is not going away, no lock needed */
        add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
        if (!p->key)
                goto no_block;
        while (--depth) {
                bh = sb_bread(sb, le32_to_cpu(p->key));
                if (!bh)
                        goto failure;
                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
                if (!p->key)
                        goto no_block;
        }
        return NULL;

failure:
        *err = -EIO;
no_block:
        return p;
}
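
/*
 * Example of the resulting chain (a sketch, for depth == 2, i.e. a
 * singly-indirect lookup): chain[0].p points at
 * EXT4_I(inode)->i_data[EXT4_IND_BLOCK] and chain[0].bh is NULL, while
 * chain[1].p points into chain[1].bh->b_data at offsets[1] and
 * chain[1].key holds the little-endian data block number.  A NULL return
 * means both keys were non-zero, so the data block was found.
 */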

/**
 *      ext4_find_near - find a place for allocation with sufficient locality
 *      @inode: owner
 *      @ind: descriptor of indirect block.
 *
 *      This function returns the preferred place for block allocation.
 *      It is used when the heuristic for sequential allocation fails.
 *      Rules are:
 *        + if there is a block to the left of our position - allocate near it.
 *        + if pointer will live in indirect block - allocate near that block.
 *        + if pointer will live in inode - allocate in the same
 *          cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.  The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 *      Caller must make sure that @ind is valid and will stay that way.
 */
static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
        __le32 *p;
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;

        /* Try to find previous block */
        for (p = ind->p - 1; p >= start; p--) {
                if (*p)
                        return le32_to_cpu(*p);
        }

        /* No such thing, so let's try location of indirect block */
        if (ind->bh)
                return ind->bh->b_blocknr;

        /*
         * It is going to be referred to from the inode itself? OK, just put it
         * into the same cylinder group then.
         */
        bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
        else
                colour = (current->pid % 16) * ((last_block - bg_start) / 16);
        return bg_start + colour;
}
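
/*
 * Example of the colouring arithmetic above (illustrative, assuming
 * 32768 blocks per group): a full group is split into 16 slices of 2048
 * blocks, so a caller with pid % 16 == 3 gets goal bg_start + 6144.
 * Different processes therefore tend to allocate in different slices of
 * the same group instead of contending for the same blocks.
 */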

/**
 *      ext4_find_goal - find a preferred place for allocation.
 *      @inode: owner
 *      @block:  block we want
 *      @partial: pointer to the last triple within a chain
 *
 *      Normally this function finds the preferred place for block
 *      allocation and returns it.
 */
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
                Indirect *partial)
{
        struct ext4_block_alloc_info *block_i;

        block_i = EXT4_I(inode)->i_block_alloc_info;

        /*
         * try the heuristic for sequential allocation,
         * failing that at least try to get decent locality.
         */
        if (block_i && (block == block_i->last_alloc_logical_block + 1)
                && (block_i->last_alloc_physical_block != 0)) {
                return block_i->last_alloc_physical_block + 1;
        }

        return ext4_find_near(inode, partial);
}

/**
 *      ext4_blks_to_allocate: Look up the block map and count the number
 *      of direct blocks that need to be allocated for the given branch.
 *
 *      @branch: chain of indirect blocks
 *      @k: number of blocks needed for indirect blocks
 *      @blks: number of data blocks to be mapped.
 *      @blocks_to_boundary:  the offset in the indirect block
 *
 *      return the total number of blocks to be allocated, including the
 *      direct and indirect blocks.
 */
static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
                int blocks_to_boundary)
{
        unsigned long count = 0;

        /*
         * Simple case: if the [t,d]indirect block(s) have not been
         * allocated yet, then the blocks on that path clearly have not
         * been allocated either.
         */
        if (k > 0) {
                /* right now we don't handle cross boundary allocation */
                if (blks < blocks_to_boundary + 1)
                        count += blks;
                else
                        count += blocks_to_boundary + 1;
                return count;
        }

        count++;
        while (count < blks && count <= blocks_to_boundary &&
                le32_to_cpu(*(branch[0].p + count)) == 0) {
                count++;
        }
        return count;
}
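
/*
 * Worked example (illustrative): with k == 0 (the indirect path already
 * exists), blks == 8 and blocks_to_boundary == 5, the loop counts the
 * first block plus however many of the following pointers in the
 * indirect block are still zero, stopping at the boundary - so at most
 * 6 blocks are requested here and the rest of the 8 are left for the
 * next call.
 */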

/**
 *      ext4_alloc_blocks: allocate the blocks needed for a branch, in bulk
 *      @indirect_blks: the number of blocks that need to be allocated for
 *                      indirect blocks
 *
 *      @new_blocks: on return it will store the new block numbers for
 *      the indirect blocks (if needed) and the first direct block,
 *      @blks:  on return it will store the total number of allocated
 *              direct blocks
 */
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                                ext4_lblk_t iblock, ext4_fsblk_t goal,
                                int indirect_blks, int blks,
                                ext4_fsblk_t new_blocks[4], int *err)
{
        int target, i;
        unsigned long count = 0, blk_allocated = 0;
        int index = 0;
        ext4_fsblk_t current_block = 0;
        int ret = 0;

        /*
         * Here we try to allocate the requested multiple blocks at once,
         * on a best-effort basis.
         * To build a branch, we should allocate blocks for
         * the indirect blocks (if not allocated yet), and at least
         * the first direct block of this branch.  That's the
         * minimum number of blocks we need to allocate (required).
         */
        /* first we try to allocate the indirect blocks */
        target = indirect_blks;
        while (target > 0) {
                count = target;
                /* allocating blocks for indirect blocks and direct blocks */
                current_block = ext4_new_meta_blocks(handle, inode,
                                                        goal, &count, err);
                if (*err)
                        goto failed_out;

                target -= count;
                /* allocate blocks for indirect blocks */
                while (index < indirect_blks && count) {
                        new_blocks[index++] = current_block++;
                        count--;
                }
                if (count > 0) {
                        /*
                         * save the new block number
                         * for the first direct block
                         */
                        new_blocks[index] = current_block;
                        printk(KERN_INFO "%s returned more blocks than "
                                                "requested\n", __func__);
                        WARN_ON(1);
                        break;
                }
        }

        target = blks - count;
        blk_allocated = count;
        if (!target)
                goto allocated;
        /* Now allocate data blocks */
        count = target;
        /* allocating blocks for data blocks */
        current_block = ext4_new_blocks(handle, inode, iblock,
                                                goal, &count, err);
        if (*err && (target == blks)) {
                /*
                 * if the allocation failed and we didn't allocate
                 * any blocks before
                 */
                goto failed_out;
        }
        if (!*err) {
                if (target == blks) {
                        /*
                         * save the new block number
                         * for the first direct block
                         */
                        new_blocks[index] = current_block;
                }
                blk_allocated += count;
        }
allocated:
        /* total number of blocks allocated for direct blocks */
        ret = blk_allocated;
        *err = 0;
        return ret;
failed_out:
        for (i = 0; i < index; i++)
                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
        return ret;
}

/**
 *      ext4_alloc_branch - allocate and set up a chain of blocks.
 *      @inode: owner
 *      @indirect_blks: number of allocated indirect blocks
 *      @blks: number of allocated direct blocks
 *      @offsets: offsets (in the blocks) to store the pointers to next.
 *      @branch: place to store the chain in.
 *
 *      This function allocates blocks, zeroes out all but the last one,
 *      links them into chain and (if we are synchronous) writes them to disk.
 *      In other words, it prepares a branch that can be spliced onto the
 *      inode. It stores the information about that chain in the branch[], in
 *      the same format as ext4_get_branch() would do. We are calling it after
 *      we had read the existing part of chain and partial points to the last
 *      triple of that (one with zero ->key). Upon the exit we have the same
 *      picture as after the successful ext4_get_block(), except that in one
 *      place chain is disconnected - *branch->p is still zero (we did not
 *      set the last link), but branch->key contains the number that should
 *      be placed into *branch->p to fill that gap.
 *
 *      If allocation fails we free all blocks we've allocated (and forget
 *      their buffer_heads) and return the error value from the failed
 *      ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 *      as described above and return 0.
 */
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                                ext4_lblk_t iblock, int indirect_blks,
                                int *blks, ext4_fsblk_t goal,
                                ext4_lblk_t *offsets, Indirect *branch)
{
        int blocksize = inode->i_sb->s_blocksize;
        int i, n = 0;
        int err = 0;
        struct buffer_head *bh;
        int num;
        ext4_fsblk_t new_blocks[4];
        ext4_fsblk_t current_block;

        num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
                                *blks, new_blocks, &err);
        if (err)
                return err;

        branch[0].key = cpu_to_le32(new_blocks[0]);
        /*
         * metadata blocks and data blocks are allocated.
         */
        for (n = 1; n <= indirect_blks; n++) {
                /*
                 * Get buffer_head for parent block, zero it out
                 * and set the pointer to new one, then send
                 * parent to disk.
                 */
                bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
                branch[n].bh = bh;
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
                err = ext4_journal_get_create_access(handle, bh);
                if (err) {
                        unlock_buffer(bh);
                        brelse(bh);
                        goto failed;
                }

                memset(bh->b_data, 0, blocksize);
                branch[n].p = (__le32 *) bh->b_data + offsets[n];
                branch[n].key = cpu_to_le32(new_blocks[n]);
                *branch[n].p = branch[n].key;
                if (n == indirect_blks) {
                        current_block = new_blocks[n];
                        /*
                         * End of chain, update the last new metablock of
                         * the chain to point to the new allocated
                         * data blocks numbers
                         */
                        for (i = 1; i < num; i++)
                                *(branch[n].p + i) = cpu_to_le32(++current_block);
                }
                BUFFER_TRACE(bh, "marking uptodate");
                set_buffer_uptodate(bh);
                unlock_buffer(bh);

                BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
                err = ext4_journal_dirty_metadata(handle, bh);
                if (err)
                        goto failed;
        }
        *blks = num;
        return err;
failed:
        /* Allocation failed, free what we already allocated */
        for (i = 1; i <= n; i++) {
                BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
                ext4_journal_forget(handle, branch[i].bh);
        }
        for (i = 0; i < indirect_blks; i++)
                ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);

        ext4_free_blocks(handle, inode, new_blocks[i], num, 0);

        return err;
}

/**
 * ext4_splice_branch - splice the allocated branch onto inode.
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
 *      ext4_alloc_branch)
 * @where: location of missing link
 * @num:   number of indirect blocks we are adding
 * @blks:  number of direct blocks we are adding
 *
 * This function fills the missing link and does all housekeeping needed in
 * inode (->i_blocks, etc.). In case of success we end up with the full
 * chain to new block and return 0.
 */
static int ext4_splice_branch(handle_t *handle, struct inode *inode,
                        ext4_lblk_t block, Indirect *where, int num, int blks)
{
        int i;
        int err = 0;
        struct ext4_block_alloc_info *block_i;
        ext4_fsblk_t current_block;

        block_i = EXT4_I(inode)->i_block_alloc_info;
        /*
         * If we're splicing into a [td]indirect block (as opposed to the
         * inode) then we need to get write access to the [td]indirect block
         * before the splice.
         */
        if (where->bh) {
                BUFFER_TRACE(where->bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, where->bh);
                if (err)
                        goto err_out;
        }
        /* That's it */

        *where->p = where->key;

        /*
         * Update the host buffer_head or inode to point to the
         * just-allocated direct blocks
         */
        if (num == 0 && blks > 1) {
                current_block = le32_to_cpu(where->key) + 1;
                for (i = 1; i < blks; i++)
                        *(where->p + i) = cpu_to_le32(current_block++);
        }

        /*
         * update the most recently allocated logical & physical block
         * in i_block_alloc_info, to help find the proper goal block for
         * the next allocation
         */
        if (block_i) {
                block_i->last_alloc_logical_block = block + blks - 1;
                block_i->last_alloc_physical_block =
                                le32_to_cpu(where[num].key) + blks - 1;
        }

        /* We are done with atomic stuff, now do the rest of housekeeping */

        inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);

        /* had we spliced it onto indirect block? */
        if (where->bh) {
                /*
                 * If we spliced it onto an indirect block, we haven't
                 * altered the inode.  Note however that if it is being spliced
                 * onto an indirect block at the very end of the file (the
                 * file is growing) then we *will* alter the inode to reflect
                 * the new i_size.  But that is not done here - it is done in
                 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
                 */
                jbd_debug(5, "splicing indirect only\n");
                BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
                err = ext4_journal_dirty_metadata(handle, where->bh);
                if (err)
                        goto err_out;
        } else {
                /*
                 * OK, we spliced it into the inode itself on a direct block.
                 * Inode was dirtied above.
                 */
                jbd_debug(5, "splicing direct\n");
        }
        return err;

err_out:
        for (i = 1; i <= num; i++) {
                BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
                ext4_journal_forget(handle, where[i].bh);
                ext4_free_blocks(handle, inode,
                                        le32_to_cpu(where[i-1].key), 1, 0);
        }
        ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);

        return err;
}

/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if create == 0.
 *
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 *
 * Needs to be called with
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
 * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
 */
int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
                ext4_lblk_t iblock, unsigned long maxblocks,
                struct buffer_head *bh_result,
                int create, int extend_disksize)
{
        int err = -EIO;
        ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        ext4_fsblk_t goal;
        int indirect_blks;
        int blocks_to_boundary = 0;
        int depth;
        struct ext4_inode_info *ei = EXT4_I(inode);
        int count = 0;
        ext4_fsblk_t first_block = 0;

        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
        J_ASSERT(handle != NULL || create == 0);
        depth = ext4_block_to_path(inode, iblock, offsets,
                                        &blocks_to_boundary);

        if (depth == 0)
                goto out;

        partial = ext4_get_branch(inode, depth, offsets, chain, &err);

        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                first_block = le32_to_cpu(chain[depth - 1].key);
                clear_buffer_new(bh_result);
                count++;
                /* map more blocks */
                while (count < maxblocks && count <= blocks_to_boundary) {
                        ext4_fsblk_t blk;

                        blk = le32_to_cpu(*(chain[depth-1].p + count));

                        if (blk == first_block + count)
                                count++;
                        else
                                break;
                }
                goto got_it;
        }

        /* Next simple case - plain lookup or failed read of indirect block */
        if (!create || err == -EIO)
                goto cleanup;

        /*
         * Okay, we need to do block allocation.  Lazily initialize the block
         * allocation info here if necessary
         */
        if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
                ext4_init_block_alloc_info(inode);

        goal = ext4_find_goal(inode, iblock, partial);

        /* the number of blocks we need to allocate for [d,t]indirect blocks */
        indirect_blks = (chain + depth) - partial - 1;

        /*
         * Next look up the indirect map to count the total number of
         * direct blocks to allocate for this branch.
         */
        count = ext4_blks_to_allocate(partial, indirect_blks,
                                        maxblocks, blocks_to_boundary);
        /*
         * Block out ext4_truncate while we alter the tree
         */
        err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
                                        &count, goal,
                                        offsets + (partial - chain), partial);

        /*
         * The ext4_splice_branch call will free and forget any buffers
         * on the new chain if there is a failure, but that risks using
         * up transaction credits, especially for bitmaps where the
         * credits cannot be returned.  Can we handle this somehow?  We
         * may need to return -EAGAIN upwards in the worst case.  --sct
         */
        if (!err)
                err = ext4_splice_branch(handle, inode, iblock,
                                        partial, indirect_blks, count);
        /*
         * i_disksize growing is protected by i_data_sem.  Don't forget to
         * protect it if you're about to implement concurrent
         * ext4_get_block() -bzzz
         */
        if (!err && extend_disksize && inode->i_size > ei->i_disksize)
                ei->i_disksize = inode->i_size;
        if (err)
                goto cleanup;

        set_buffer_new(bh_result);
got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
        if (count > blocks_to_boundary)
                set_buffer_boundary(bh_result);
        err = count;
        /* Clean up and exit */
        partial = chain + depth - 1;    /* the whole chain */
cleanup:
        while (partial > chain) {
                BUFFER_TRACE(partial->bh, "call brelse");
                brelse(partial->bh);
                partial--;
        }
        BUFFER_TRACE(bh_result, "returned");
out:
        return err;
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
/*
 * Number of credits we need for writing DIO_MAX_BLOCKS:
 * We need sb + group descriptor + bitmap + inode -> 4
 * For B blocks with A block pointers per block we need:
 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
 */
#define DIO_CREDITS 25
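
/*
 * Checking the credit arithmetic above with B == 4096 blocks and A == 256
 * pointers per 1KB block: B/A/A + 2 == 0 + 2 doubly-indirect blocks
 * (integer division), B/A + 2 == 18 indirect blocks, plus 1 triple-indirect
 * block and the sb + group descriptor + bitmap + inode quartet:
 * 4 + 1 + 2 + 18 == 25.
 */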

/*
 * The ext4 get_block() wrapper function.
 * It will do a lookup first, and return if the blocks are already mapped.
 * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
 * If the file is extents based, it will call ext4_ext_get_blocks();
 * otherwise it calls ext4_get_blocks_handle() to handle indirect-mapped
 * files.
 *
 * On success, it returns the number of blocks mapped or allocated.
 * If create == 0 and the blocks are pre-allocated and uninitialized,
 * the result buffer head is unmapped. If create == 1, it will make sure
 * the buffer head is mapped.
 *
 * It returns 0 if a plain lookup failed (blocks have not been allocated);
 * in that case the buffer head is unmapped.
 *
 * It returns the error in case of allocation failure.
 */
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                        unsigned long max_blocks, struct buffer_head *bh,
                        int create, int extend_disksize)
{
        int retval;

        clear_buffer_mapped(bh);

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read((&EXT4_I(inode)->i_data_sem));
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
                                bh, 0, 0);
        } else {
                retval = ext4_get_blocks_handle(handle,
                                inode, block, max_blocks, bh, 0, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));

        /* If it is only a block(s) look up */
        if (!create)
                return retval;

        /*
         * Return if the blocks have already been allocated.
         *
         * Note that if blocks have been preallocated,
         * ext4_ext_get_blocks() returns with create = 0
         * and the buffer head unmapped.
         */
        if (retval > 0 && buffer_mapped(bh))
                return retval;

        /*
         * New block allocation and/or writing to an uninitialized extent
         * will possibly result in updating i_data, so we take
         * the write lock of i_data_sem, and call get_blocks()
         * with create == 1 flag.
         */
        down_write((&EXT4_I(inode)->i_data_sem));
        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
         */
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
                                bh, create, extend_disksize);
        } else {
                retval = ext4_get_blocks_handle(handle, inode, block,
                                max_blocks, bh, create, extend_disksize);

                if (retval > 0 && buffer_new(bh)) {
                        /*
                         * We allocated new blocks which will result in
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
                        EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
                                                        ~EXT4_EXT_MIGRATE;
                }
        }
        up_write((&EXT4_I(inode)->i_data_sem));
        return retval;
}

static int ext4_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
{
        handle_t *handle = ext4_journal_current_handle();
        int ret = 0, started = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;

        if (create && !handle) {
                /* Direct IO write... */
                if (max_blocks > DIO_MAX_BLOCKS)
                        max_blocks = DIO_MAX_BLOCKS;
                handle = ext4_journal_start(inode, DIO_CREDITS +
                              2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }
                started = 1;
        }

        ret = ext4_get_blocks_wrap(handle, inode, iblock,
                                        max_blocks, bh_result, create, 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
        }
        if (started)
                ext4_journal_stop(handle);
out:
        return ret;
}

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                ext4_lblk_t block, int create, int *errp)
{
        struct buffer_head dummy;
        int fatal = 0, err;

        J_ASSERT(handle != NULL || create == 0);

        dummy.b_state = 0;
        dummy.b_blocknr = -1000;
        buffer_trace_init(&dummy.b_history);
        err = ext4_get_blocks_wrap(handle, inode, block, 1,
                                        &dummy, create, 1);
        /*
         * ext4_get_blocks_wrap() returns the number of blocks
         * mapped; 0 in case of a hole.
         */
        if (err > 0) {
                if (err > 1)
                        WARN_ON(1);
                err = 0;
        }
        *errp = err;
        if (!err && buffer_mapped(&dummy)) {
                struct buffer_head *bh;
                bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
                if (!bh) {
                        *errp = -EIO;
                        goto err;
                }
                if (buffer_new(&dummy)) {
                        J_ASSERT(create != 0);
                        J_ASSERT(handle != NULL);

                        /*
                         * Now that we do not always journal data, we should
                         * keep in mind whether this should always journal the
                         * new buffer as metadata.  For now, regular file
                         * writes use ext4_get_block instead, so it's not a
                         * problem.
                         */
                        lock_buffer(bh);
                        BUFFER_TRACE(bh, "call get_create_access");
                        fatal = ext4_journal_get_create_access(handle, bh);
                        if (!fatal && !buffer_uptodate(bh)) {
                                memset(bh->b_data, 0, inode->i_sb->s_blocksize);
                                set_buffer_uptodate(bh);
                        }
                        unlock_buffer(bh);
                        BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
                        err = ext4_journal_dirty_metadata(handle, bh);
                        if (!fatal)
                                fatal = err;
                } else {
                        BUFFER_TRACE(bh, "not a new buffer");
                }
                if (fatal) {
                        *errp = fatal;
                        brelse(bh);
                        bh = NULL;
                }
                return bh;
        }
err:
        return NULL;
}

struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
                               ext4_lblk_t block, int create, int *err)
{
        struct buffer_head *bh;

        bh = ext4_getblk(handle, inode, block, create, err);
        if (!bh)
                return bh;
        if (buffer_uptodate(bh))
                return bh;
        ll_rw_block(READ_META, 1, &bh);
        wait_on_buffer(bh);
        if (buffer_uptodate(bh))
                return bh;
        put_bh(bh);
        *err = -EIO;
        return NULL;
}

static int walk_page_buffers(handle_t *handle,
                                struct buffer_head *head,
                                unsigned from,
                                unsigned to,
                                int *partial,
                                int (*fn)(handle_t *handle,
                                                struct buffer_head *bh))
{
        struct buffer_head *bh;
        unsigned block_start, block_end;
        unsigned blocksize = head->b_size;
        int err, ret = 0;
        struct buffer_head *next;

        for (bh = head, block_start = 0;
             ret == 0 && (bh != head || !block_start);
             block_start = block_end, bh = next) {
                next = bh->b_this_page;
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        if (partial && !buffer_uptodate(bh))
                                *partial = 1;
                        continue;
                }
                err = (*fn)(handle, bh);
                if (!ret)
                        ret = err;
        }
        return ret;
}

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage() ->
 * block_write_full_page(). In that case, we *know* that ext4_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */
static int do_journal_get_write_access(handle_t *handle,
                                        struct buffer_head *bh)
{
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        return ext4_journal_get_write_access(handle, bh);
}

static int ext4_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
        handle_t *handle;
        int retries = 0;
        struct page *page;
        pgoff_t index;
        unsigned from, to;

        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;

retry:
        handle = ext4_journal_start(inode, needed_blocks);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
        }

        page = __grab_cache_page(mapping, index);
        if (!page) {
                ext4_journal_stop(handle);
                ret = -ENOMEM;
                goto out;
        }
        *pagep = page;

        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                                        ext4_get_block);

        if (!ret && ext4_should_journal_data(inode)) {
                ret = walk_page_buffers(handle, page_buffers(page),
                                from, to, NULL, do_journal_get_write_access);
        }

        if (ret) {
                unlock_page(page);
                ext4_journal_stop(handle);
                page_cache_release(page);
        }

        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
out:
        return ret;
}

/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        set_buffer_uptodate(bh);
        return ext4_journal_dirty_metadata(handle, bh);
}

/*
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - e.g., when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->private_list.  Metadata
 * buffers are managed internally.
 */
static int ext4_ordered_write_end(struct file *file,
                                struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata)
{
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        unsigned from, to;
        int ret = 0, ret2;

        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;

        ret = ext4_jbd2_file_inode(handle, inode);

        if (ret == 0) {
                /*
                 * generic_write_end() will run mark_inode_dirty() if i_size
                 * changes.  So let's piggyback the i_disksize mark_inode_dirty
                 * into that.
                 */
                loff_t new_i_size;

                new_i_size = pos + copied;
                if (new_i_size > EXT4_I(inode)->i_disksize)
                        EXT4_I(inode)->i_disksize = new_i_size;
                ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
                copied = ret2;
                if (ret2 < 0)
                        ret = ret2;
        }
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;

        return ret ? ret : copied;
}

static int ext4_writeback_write_end(struct file *file,
                                struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata)
{
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
        loff_t new_i_size;

        new_i_size = pos + copied;
        if (new_i_size > EXT4_I(inode)->i_disksize)
                EXT4_I(inode)->i_disksize = new_i_size;

        ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
        copied = ret2;
        if (ret2 < 0)
                ret = ret2;

        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;

        return ret ? ret : copied;
}

static int ext4_journalled_write_end(struct file *file,
                                struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata)
{
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
        int partial = 0;
        unsigned from, to;

        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;

        if (copied < len) {
                if (!PageUptodate(page))
                        copied = 0;
                page_zero_new_buffers(page, from+copied, to);
        }

        ret = walk_page_buffers(handle, page_buffers(page), from,
                                to, &partial, write_end_fn);
        if (!partial)
                SetPageUptodate(page);
        if (pos+copied > inode->i_size)
                i_size_write(inode, pos+copied);
        EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
        if (inode->i_size > EXT4_I(inode)->i_disksize) {
                EXT4_I(inode)->i_disksize = inode->i_size;
                ret2 = ext4_mark_inode_dirty(handle, inode);
                if (!ret)
                        ret = ret2;
        }

        unlock_page(page);
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
        page_cache_release(page);

        return ret ? ret : copied;
}
1412
1413 /*
1414  * Delayed allocation stuff
1415  */
1416
1417 struct mpage_da_data {
1418         struct inode *inode;
1419         struct buffer_head lbh;                 /* extent of blocks */
1420         unsigned long first_page, next_page;    /* extent of pages */
1421         get_block_t *get_block;
1422         struct writeback_control *wbc;
1423 };
1424
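/*
 * Rough map of the delayed allocation writeback path below (all of the
 * named functions are defined in this file):
 *
 *   mpage_da_writepages()
 *     write_cache_pages() invokes __mpage_da_writepage() per dirty page
 *       mpage_add_bh_to_extent()   - grow mpd->lbh with contiguous,
 *                                    same-state blocks; flush on a gap
 *         mpage_da_map_blocks()    - allocate blocks via ->get_block()
 *           mpage_put_bnr_to_bhs() - stamp block numbers, clear BH_Delay
 *     mpage_da_submit_io()         - finally write the collected pages
 */
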
1425 /*
1426  * mpage_da_submit_io - walks through the extent of pages and tries to
1427  * write them with __mpage_writepage()
1428  *
1429  * @mpd->inode: inode
1430  * @mpd->first_page: first page of the extent
1431  * @mpd->next_page: page after the last page of the extent
1432  * @mpd->get_block: the filesystem's block mapper function
1433  *
1434  * By the time mpage_da_submit_io() is called we expect all blocks
1435  * to be allocated.  This may be wrong if allocation failed.
1436  *
1437  * As the pages are already locked by write_cache_pages(), we can't use it here
1438  */
1439 static int mpage_da_submit_io(struct mpage_da_data *mpd)
1440 {
1441         struct address_space *mapping = mpd->inode->i_mapping;
1442         struct mpage_data mpd_pp = {
1443                 .bio = NULL,
1444                 .last_block_in_bio = 0,
1445                 .get_block = mpd->get_block,
1446                 .use_writepage = 1,
1447         };
1448         int ret = 0, err, nr_pages, i;
1449         unsigned long index, end;
1450         struct pagevec pvec;
1451
1452         BUG_ON(mpd->next_page <= mpd->first_page);
1453
1454         pagevec_init(&pvec, 0);
1455         index = mpd->first_page;
1456         end = mpd->next_page - 1;
1457
1458         while (index <= end) {
1459                 /* XXX: optimize tail */
1460                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1461                 if (nr_pages == 0)
1462                         break;
1463                 for (i = 0; i < nr_pages; i++) {
1464                         struct page *page = pvec.pages[i];
1465
1466                         index = page->index;
1467                         if (index > end)
1468                                 break;
1469                         index++;
1470
1471                         err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
1472
1473                         /*
1474                          * In error case, we have to continue because
1475                          * remaining pages are still locked
1476                          * XXX: unlock and re-dirty them?
1477                          */
1478                         if (ret == 0)
1479                                 ret = err;
1480                 }
1481                 pagevec_release(&pvec);
1482         }
1483         if (mpd_pp.bio)
1484                 mpage_bio_submit(WRITE, mpd_pp.bio);
1485
1486         return ret;
1487 }
1488
1489 /*
1490  * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
1491  *
1492  * @mpd->inode - inode to walk through
1493  * @exbh->b_blocknr - first block on a disk
1494  * @exbh->b_size - amount of space in bytes
1495  * @logical - first logical block to start assignment with
1496  *
1497  * The function goes through all the passed space and puts actual disk
1498  * block numbers into the buffer heads, dropping BH_Delay
1499  */
1500 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1501                                  struct buffer_head *exbh)
1502 {
1503         struct inode *inode = mpd->inode;
1504         struct address_space *mapping = inode->i_mapping;
1505         int blocks = exbh->b_size >> inode->i_blkbits;
1506         sector_t pblock = exbh->b_blocknr, cur_logical;
1507         struct buffer_head *head, *bh;
1508         unsigned long index, end;
1509         struct pagevec pvec;
1510         int nr_pages, i;
1511
1512         index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1513         end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1514         cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1515
1516         pagevec_init(&pvec, 0);
1517
1518         while (index <= end) {
1519                 /* XXX: optimize tail */
1520                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1521                 if (nr_pages == 0)
1522                         break;
1523                 for (i = 0; i < nr_pages; i++) {
1524                         struct page *page = pvec.pages[i];
1525
1526                         index = page->index;
1527                         if (index > end)
1528                                 break;
1529                         index++;
1530
1531                         BUG_ON(!PageLocked(page));
1532                         BUG_ON(PageWriteback(page));
1533                         BUG_ON(!page_has_buffers(page));
1534
1535                         bh = page_buffers(page);
1536                         head = bh;
1537
1538                         /* skip blocks out of the range */
1539                         do {
1540                                 if (cur_logical >= logical)
1541                                         break;
1542                                 cur_logical++;
1543                         } while ((bh = bh->b_this_page) != head);
1544
1545                         do {
1546                                 if (cur_logical >= logical + blocks)
1547                                         break;
1548
1549                                 if (buffer_delay(bh)) {
1550                                         bh->b_blocknr = pblock;
1551                                         clear_buffer_delay(bh);
1552                                 } else if (buffer_mapped(bh)) {
1553                                         BUG_ON(bh->b_blocknr != pblock);
1554                                 }
1555
1556                                 cur_logical++;
1557                                 pblock++;
1558                         } while ((bh = bh->b_this_page) != head);
1559                 }
1560                 pagevec_release(&pvec);
1561         }
1562 }
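
/*
 * Example of the index arithmetic above: with 4K pages and 1K blocks
 * (PAGE_CACHE_SHIFT - i_blkbits == 2), logical block 13 lives on page
 * index 13 >> 2 == 3, whose first block is cur_logical = 3 << 2 == 12.
 */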
1563
1564
1565 /*
1566  * __unmap_underlying_blocks - just a helper function to unmap
1567  * a set of blocks described by @bh
1568  */
1569 static inline void __unmap_underlying_blocks(struct inode *inode,
1570                                              struct buffer_head *bh)
1571 {
1572         struct block_device *bdev = inode->i_sb->s_bdev;
1573         int blocks, i;
1574
1575         blocks = bh->b_size >> inode->i_blkbits;
1576         for (i = 0; i < blocks; i++)
1577                 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1578 }
1579
1580 /*
1581  * mpage_da_map_blocks - go through the given space
1582  *
1583  * @mpd->lbh - bh describing space
1584  * @mpd->get_block - the filesystem's block mapper function
1585  *
1586  * The function skips space we know is already mapped to disk blocks.
1587  *
1588  * The function ignores errors returned by ->get_block(); real
1589  * error handling is postponed to __mpage_writepage()
1590  */
1591 static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1592 {
1593         struct buffer_head *lbh = &mpd->lbh;
1594         int err = 0, remain = lbh->b_size;
1595         sector_t next = lbh->b_blocknr;
1596         struct buffer_head new;
1597
1598         /*
1599          * We consider only non-mapped and non-allocated blocks
1600          */
1601         if (buffer_mapped(lbh) && !buffer_delay(lbh))
1602                 return;
1603
1604         while (remain) {
1605                 new.b_state = lbh->b_state;
1606                 new.b_blocknr = 0;
1607                 new.b_size = remain;
1608                 err = mpd->get_block(mpd->inode, next, &new, 1);
1609                 if (err) {
1610                         /*
1611                          * Rather than implement own error handling
1612                          * here, we just leave remaining blocks
1613                          * unallocated and try again with ->writepage()
1614                          */
1615                         break;
1616                 }
1617                 BUG_ON(new.b_size == 0);
1618
1619                 if (buffer_new(&new))
1620                         __unmap_underlying_blocks(mpd->inode, &new);
1621
1622                 /*
1623                  * If blocks are delayed marked, we need to
1624                  * put actual blocknr and drop delayed bit
1625                  */
1626                 if (buffer_delay(lbh))
1627                         mpage_put_bnr_to_bhs(mpd, next, &new);
1628
1629                 /* go for the remaining blocks */
1630                 next += new.b_size >> mpd->inode->i_blkbits;
1631                 remain -= new.b_size;
1632         }
1633 }
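
/*
 * Note that ->get_block() may map fewer blocks than requested
 * (new.b_size <= remain), which is why the loop above keeps advancing
 * 'next' and shrinking 'remain' until the whole extent is covered or an
 * error ends the walk early.
 */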
1634
1635 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
1636
1637 /*
1638  * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1639  *
1640  * @mpd->lbh - extent of blocks
1641  * @logical - logical number of the block in the file
1642  * @bh - bh of the block (used to access block's state)
1643  *
1644  * The function is used to collect contiguous blocks in the same state
1645  */
1646 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1647                                    sector_t logical, struct buffer_head *bh)
1648 {
1649         struct buffer_head *lbh = &mpd->lbh;
1650         sector_t next;
1651
1652         next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
1653
1654         /*
1655          * First block in the extent
1656          */
1657         if (lbh->b_size == 0) {
1658                 lbh->b_blocknr = logical;
1659                 lbh->b_size = bh->b_size;
1660                 lbh->b_state = bh->b_state & BH_FLAGS;
1661                 return;
1662         }
1663
1664         /*
1665          * Can we merge the block to our big extent?
1666          */
1667         if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
1668                 lbh->b_size += bh->b_size;
1669                 return;
1670         }
1671
1672         /*
1673  * We couldn't merge the block to our extent, so we
1674  * need to flush the current extent and start a new one
1675          */
1676         mpage_da_map_blocks(mpd);
1677
1678         /*
1679          * Now start a new extent
1680          */
1681         lbh->b_size = bh->b_size;
1682         lbh->b_state = bh->b_state & BH_FLAGS;
1683         lbh->b_blocknr = logical;
1684 }
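
/*
 * Example: three dirty delayed buffers at logical blocks 8, 9 and 10
 * arrive here in order.  The first call seeds lbh (b_blocknr = 8,
 * b_size = 1 block); the next two merge because each logical block
 * equals 'next' and the BH_FLAGS state matches, growing b_size to
 * 3 blocks.  A gap or a state change would instead flush the current
 * extent through mpage_da_map_blocks() and start a new one.
 */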
1685
1686 /*
1687  * __mpage_da_writepage - finds extent of pages and blocks
1688  *
1689  * @page: page to consider
1690  * @wbc: not used, we just follow rules
1691  * @data: context
1692  *
1693  * The function finds extents of pages and scans them for all blocks.
1694  */
1695 static int __mpage_da_writepage(struct page *page,
1696                                 struct writeback_control *wbc, void *data)
1697 {
1698         struct mpage_da_data *mpd = data;
1699         struct inode *inode = mpd->inode;
1700         struct buffer_head *bh, *head, fake;
1701         sector_t logical;
1702
1703         /*
1704          * Can we merge this page to current extent?
1705          */
1706         if (mpd->next_page != page->index) {
1707                 /*
1708                  * Nope, we can't. So, we map non-allocated blocks
1709                  * and start IO on them using __mpage_writepage()
1710                  */
1711                 if (mpd->next_page != mpd->first_page) {
1712                         mpage_da_map_blocks(mpd);
1713                         mpage_da_submit_io(mpd);
1714                 }
1715
1716                 /*
1717                  * Start next extent of pages ...
1718                  */
1719                 mpd->first_page = page->index;
1720
1721                 /*
1722                  * ... and blocks
1723                  */
1724                 mpd->lbh.b_size = 0;
1725                 mpd->lbh.b_state = 0;
1726                 mpd->lbh.b_blocknr = 0;
1727         }
1728
1729         mpd->next_page = page->index + 1;
1730         logical = (sector_t) page->index <<
1731                   (PAGE_CACHE_SHIFT - inode->i_blkbits);
1732
1733         if (!page_has_buffers(page)) {
1734                 /*
1735                  * There are no buffer heads attached yet (mmap?), so
1736                  * we treat the page as full of dirty blocks
1737                  */
1738                 bh = &fake;
1739                 bh->b_size = PAGE_CACHE_SIZE;
1740                 bh->b_state = 0;
1741                 set_buffer_dirty(bh);
1742                 set_buffer_uptodate(bh);
1743                 mpage_add_bh_to_extent(mpd, logical, bh);
1744         } else {
1745                 /*
1746                  * Page with regular buffer heads, just add all dirty ones
1747                  */
1748                 head = page_buffers(page);
1749                 bh = head;
1750                 do {
1751                         BUG_ON(buffer_locked(bh));
1752                         if (buffer_dirty(bh))
1753                                 mpage_add_bh_to_extent(mpd, logical, bh);
1754                         logical++;
1755                 } while ((bh = bh->b_this_page) != head);
1756         }
1757
1758         return 0;
1759 }
1760
1761 /*
1762  * mpage_da_writepages - walks the list of dirty pages of the given
1763  * address space, allocates non-allocated blocks, maps newly-allocated
1764  * blocks to existing bhs and issues IO on them
1765  *
1766  * @mapping: address space structure to write
1767  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1768  * @get_block: the filesystem's block mapper function.
1769  *
1770  * This is a library function, which implements the writepages()
1771  * address_space_operation.
1772  *
1773  * In order to avoid duplication of logic that deals with partial pages,
1774  * multiple bio per page, etc, we find non-allocated blocks, allocate
1775  * them with minimal calls to ->get_block() and re-use __mpage_writepage()
1776  *
1777  * It's important that we call __mpage_writepage() only once for each
1778  * involved page, otherwise we'd have to implement more complicated logic
1779  * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
1780  *
1781  * See comments to mpage_writepages()
1782  */
1783 static int mpage_da_writepages(struct address_space *mapping,
1784                                struct writeback_control *wbc,
1785                                get_block_t get_block)
1786 {
1787         struct mpage_da_data mpd;
1788         int ret;
1789
1790         if (!get_block)
1791                 return generic_writepages(mapping, wbc);
1792
1793         mpd.wbc = wbc;
1794         mpd.inode = mapping->host;
1795         mpd.lbh.b_size = 0;
1796         mpd.lbh.b_state = 0;
1797         mpd.lbh.b_blocknr = 0;
1798         mpd.first_page = 0;
1799         mpd.next_page = 0;
1800         mpd.get_block = get_block;
1801
1802         ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
1803
1804         /*
1805          * Handle last extent of pages
1806          */
1807         if (mpd.next_page != mpd.first_page) {
1808                 mpage_da_map_blocks(&mpd);
1809                 mpage_da_submit_io(&mpd);
1810         }
1811
1812         return ret;
1813 }
1814
1815 /*
1816  * This is a special callback for ->write_begin() only;
1817  * its intention is to return a mapped block or to reserve space
1818  */
1819 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1820                                   struct buffer_head *bh_result, int create)
1821 {
1822         int ret = 0;
1823
1824         BUG_ON(create == 0);
1825         BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
1826
1827         /*
1828          * First, we need to know whether the block is allocated already;
1829          * preallocated blocks are unmapped but should be treated
1830          * the same as allocated blocks.
1831          */
1832         ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0);
1833         if (ret == 0) {
1834                 /* the block isn't allocated yet, let's reserve space */
1835                 /* XXX: call reservation here */
1836                 /*
1837                  * XXX: __block_prepare_write() unmaps passed block,
1838                  * is it OK?
1839                  */
1840                 map_bh(bh_result, inode->i_sb, 0);
1841                 set_buffer_new(bh_result);
1842                 set_buffer_delay(bh_result);
1843         } else if (ret > 0) {
1844                 bh_result->b_size = (ret << inode->i_blkbits);
1845                 ret = 0;
1846         }
1847
1848         return ret;
1849 }
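
/*
 * Life cycle of a delalloc buffer: ext4_da_write_begin() maps it above
 * with b_blocknr == 0 and BH_New | BH_Delay set; later
 * ext4_da_writepages() allocates real blocks and mpage_put_bnr_to_bhs()
 * fills in b_blocknr and clears BH_Delay before the page is written.
 */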
1850
1851 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1852                                    struct buffer_head *bh_result, int create)
1853 {
1854         int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1855         unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1856         loff_t disksize = EXT4_I(inode)->i_disksize;
1857         handle_t *handle = NULL;
1858
1859         if (create) {
1860                 handle = ext4_journal_start(inode, needed_blocks);
1861                 if (IS_ERR(handle)) {
1862                         ret = PTR_ERR(handle);
1863                         goto out;
1864                 }
1865         }
1866
1867         ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
1868                                    bh_result, create, 0);
1869         if (ret > 0) {
1870                 bh_result->b_size = (ret << inode->i_blkbits);
1871
1872                 /*
1873                  * Update on-disk size along with block allocation
1874                  * we don't use 'extend_disksize' as size may change
1875                  * within already allocated block -bzzz
1876                  */
1877                 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
1878                 if (disksize > i_size_read(inode))
1879                         disksize = i_size_read(inode);
1880                 if (disksize > EXT4_I(inode)->i_disksize) {
1881                         /*
1882                          * XXX: replace with spinlock if seen contended -bzzz
1883                          */
1884                         down_write(&EXT4_I(inode)->i_data_sem);
1885                         if (disksize > EXT4_I(inode)->i_disksize)
1886                                 EXT4_I(inode)->i_disksize = disksize;
1887                         up_write(&EXT4_I(inode)->i_data_sem);
1888
1889                         if (EXT4_I(inode)->i_disksize == disksize) {
1890                                 if (handle == NULL)
1891                                         handle = ext4_journal_start(inode, 1);
1892                                 if (!IS_ERR(handle))
1893                                         ext4_mark_inode_dirty(handle, inode);
1894                         }
1895                 }
1896
1897                 ret = 0;
1898         }
1899
1900 out:
1901         if (handle && !IS_ERR(handle))
1902                 ext4_journal_stop(handle);
1903
1904         return ret;
1905 }
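
/*
 * Note the recheck of i_disksize under i_data_sem above: the unlocked
 * test is only an optimization, and the second test ensures a racing
 * updater that already advanced i_disksize is not moved backwards.
 */
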
1906 /* FIXME!! only supports data=writeback mode */
1907 static int ext4_da_writepage(struct page *page,
1908                                 struct writeback_control *wbc)
1909 {
1910         struct inode *inode = page->mapping->host;
1911         handle_t *handle = NULL;
1912         int ret = 0;
1913         int err;
1914
1915         if (ext4_journal_current_handle())
1916                 goto out_fail;
1917
1918         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1919         if (IS_ERR(handle)) {
1920                 ret = PTR_ERR(handle);
1921                 goto out_fail;
1922         }
1923
1924         if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1925                 ret = nobh_writepage(page, ext4_get_block, wbc);
1926         else
1927                 ret = block_write_full_page(page, ext4_get_block, wbc);
1928
1929         if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
1930                 EXT4_I(inode)->i_disksize = inode->i_size;
1931                 ext4_mark_inode_dirty(handle, inode);
1932         }
1933
1934         err = ext4_journal_stop(handle);
1935         if (!ret)
1936                 ret = err;
1937         return ret;
1938
1939 out_fail:
1940         redirty_page_for_writepage(wbc, page);
1941         unlock_page(page);
1942         return ret;
1943 }
1944
1945 static int ext4_da_writepages(struct address_space *mapping,
1946                                 struct writeback_control *wbc)
1947 {
1948         return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
1949 }
1950
1951 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
1952                                 loff_t pos, unsigned len, unsigned flags,
1953                                 struct page **pagep, void **fsdata)
1954 {
1955         int ret;
1956         struct page *page;
1957         pgoff_t index;
1958         unsigned from, to;
1959         struct inode *inode = mapping->host;
1960         handle_t *handle;
1961
1962         index = pos >> PAGE_CACHE_SHIFT;
1963         from = pos & (PAGE_CACHE_SIZE - 1);
1964         to = from + len;
1965
1966         /*
1967          * With delayed allocation, we don't log the i_disksize update
1968          * if there is delayed block allocation. But we still need
1969          * to journal the i_disksize update if the write to the end
1970          * of the file hits an already mapped buffer.
1971          */
1972         handle = ext4_journal_start(inode, 1);
1973         if (IS_ERR(handle)) {
1974                 ret = PTR_ERR(handle);
1975                 goto out;
1976         }
1977
        page = __grab_cache_page(mapping, index);
        if (!page) {
                ext4_journal_stop(handle);
                ret = -ENOMEM;
                goto out;
        }
        *pagep = page;
1982
1983         ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1984                                                         ext4_da_get_block_prep);
1985         if (ret < 0) {
1986                 unlock_page(page);
1987                 ext4_journal_stop(handle);
1988                 page_cache_release(page);
1989         }
1990
1991 out:
1992         return ret;
1993 }
1994
1995 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
1996 {
1997         return !buffer_mapped(bh) || buffer_delay(bh);
1998 }
1999
2000 static int ext4_da_write_end(struct file *file,
2001                                 struct address_space *mapping,
2002                                 loff_t pos, unsigned len, unsigned copied,
2003                                 struct page *page, void *fsdata)
2004 {
2005         struct inode *inode = mapping->host;
2006         int ret = 0, ret2;
2007         handle_t *handle = ext4_journal_current_handle();
2008         loff_t new_i_size;
2009
2010         /*
2011          * generic_write_end() will run mark_inode_dirty() if i_size
2012          * changes.  So let's piggyback the i_disksize mark_inode_dirty
2013          * into that.
2014          */
2015
2016         new_i_size = pos + copied;
2017         if (new_i_size > EXT4_I(inode)->i_disksize)
2018                 if (!walk_page_buffers(NULL, page_buffers(page),
2019                                        0, len, NULL, ext4_bh_unmapped_or_delay)) {
2020                         /*
2021                          * Updating i_disksize when extending file without
2022                          * needing block allocation
2023                          */
2024                         if (ext4_should_order_data(inode))
2025                                 ret = ext4_jbd2_file_inode(handle, inode);
2026
2027                         EXT4_I(inode)->i_disksize = new_i_size;
2028                 }
2029         ret2 = generic_write_end(file, mapping, pos, len, copied,
2030                                                         page, fsdata);
2031         copied = ret2;
2032         if (ret2 < 0)
2033                 ret = ret2;
2034         ret2 = ext4_journal_stop(handle);
2035         if (!ret)
2036                 ret = ret2;
2037
2038         return ret ? ret : copied;
2039 }
2040
2041 static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2042 {
2043         struct buffer_head *head, *bh;
2044         unsigned int curr_off = 0;
2045
2046         /*
2047          * Drop reserved blocks
2048          */
2049         BUG_ON(!PageLocked(page));
2050         if (!page_has_buffers(page))
2051                 goto out;
2052
2053         head = page_buffers(page);
2054         bh = head;
2055         do {
2056                 unsigned int next_off = curr_off + bh->b_size;
2057
2058                 /*
2059                  * is this block fully invalidated?
2060                  */
2061                 if (offset <= curr_off && buffer_delay(bh)) {
2062                         clear_buffer_delay(bh);
2063                         /* XXX: add real stuff here */
2064                 }
2065                 curr_off = next_off;
2066                 bh = bh->b_this_page;
2067         } while (bh != head);
2068
2069 out:
2070         ext4_invalidatepage(page, offset);
2071
2072         return;
2073 }
2074
2075
2076 /*
2077  * bmap() is special.  It gets used by applications such as lilo and by
2078  * the swapper to find the on-disk block of a specific piece of data.
2079  *
2080  * Naturally, this is dangerous if the block concerned is still in the
2081  * journal.  If somebody makes a swapfile on an ext4 data-journaling
2082  * filesystem and enables swap, then they may get a nasty shock when the
2083  * data getting swapped to that swapfile suddenly gets overwritten by
2084  * the original zeros written out previously to the journal and
2085  * awaiting writeback in the kernel's buffer cache.
2086  *
2087  * So, if we see any bmap calls here on a modified, data-journaled file,
2088  * take extra steps to flush any blocks which might be in the cache.
2089  */
2090 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2091 {
2092         struct inode *inode = mapping->host;
2093         journal_t *journal;
2094         int err;
2095
2096         if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2097                         test_opt(inode->i_sb, DELALLOC)) {
2098                 /*
2099                  * With delalloc we want to sync the file
2100                  * so that we can make sure we allocate
2101                  * blocks for the file
2102                  */
2103                 filemap_write_and_wait(mapping);
2104         }
2105
2106         if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
2107                 /*
2108                  * This is a REALLY heavyweight approach, but the use of
2109                  * bmap on dirty files is expected to be extremely rare:
2110                  * only if we run lilo or swapon on a freshly made file
2111                  * do we expect this to happen.
2112                  *
2113                  * (bmap requires CAP_SYS_RAWIO so this does not
2114                  * represent an unprivileged user DOS attack --- we'd be
2115                  * in trouble if mortal users could trigger this path at
2116                  * will.)
2117                  *
2118                  * NB. EXT4_STATE_JDATA is not set on files other than
2119                  * regular files.  If somebody wants to bmap a directory
2120                  * or symlink and gets confused because the buffer
2121                  * hasn't yet been flushed to disk, they deserve
2122                  * everything they get.
2123                  */
2124
2125                 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
2126                 journal = EXT4_JOURNAL(inode);
2127                 jbd2_journal_lock_updates(journal);
2128                 err = jbd2_journal_flush(journal);
2129                 jbd2_journal_unlock_updates(journal);
2130
2131                 if (err)
2132                         return 0;
2133         }
2134
2135         return generic_block_bmap(mapping, block, ext4_get_block);
2136 }
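
/*
 * e.g. running swapon on a freshly written file in data=journal mode
 * takes the EXT4_STATE_JDATA branch above: the journal is flushed first
 * so that the block numbers bmap returns are stable on disk.
 */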
2137
2138 static int bget_one(handle_t *handle, struct buffer_head *bh)
2139 {
2140         get_bh(bh);
2141         return 0;
2142 }
2143
2144 static int bput_one(handle_t *handle, struct buffer_head *bh)
2145 {
2146         put_bh(bh);
2147         return 0;
2148 }
2149
2150 /*
2151  * Note that we don't need to start a transaction unless we're journaling data
2152  * because we should have holes filled from ext4_page_mkwrite(). We don't even
2153  * need to file the inode to the transaction's list in ordered mode because if
2154  * we are writing back data added by write(), the inode is already there and if
2155  * we are writing back data modified via mmap(), no one guarantees in which
2156  * transaction the data will hit the disk. In case we are journaling data, we
2157  * cannot start transaction directly because transaction start ranks above page
2158  * lock so we have to do some magic.
2159  *
2160  * In all journaling modes block_write_full_page() will start the I/O.
2161  *
2162  * Problem:
2163  *
2164  *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2165  *              ext4_writepage()
2166  *
2167  * Similar for:
2168  *
2169  *      ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
2170  *
2171  * Same applies to ext4_get_block().  We will deadlock on various things like
2172  * lock_journal and i_data_sem
2173  *
2174  * Setting PF_MEMALLOC here doesn't work - too many internal memory
2175  * allocations fail.
2176  *
2177  * 16May01: If we're reentered then journal_current_handle() will be
2178  *          non-zero. We simply *return*.
2179  *
2180  * 1 July 2001: @@@ FIXME:
2181  *   In journalled data mode, a data buffer may be metadata against the
2182  *   current transaction.  But the same file is part of a shared mapping
2183  *   and someone does a writepage() on it.
2184  *
2185  *   We will move the buffer onto the async_data list, but *after* it has
2186  *   been dirtied. So there's a small window where we have dirty data on
2187  *   BJ_Metadata.
2188  *
2189  *   Note that this only applies to the last partial page in the file.  The
2190  *   bit which block_write_full_page() uses prepare/commit for.  (That's
2191  *   broken code anyway: it's wrong for msync()).
2192  *
2193  *   It's a rare case: affects the final partial page, for journalled data
2194  *   where the file is subject to both write() and writepage() in the same
2195  *   transaction.  To fix it we'll need a custom block_write_full_page().
2196  *   We'll probably need that anyway for journalling writepage() output.
2197  *
2198  * We don't honour synchronous mounts for writepage().  That would be
2199  * disastrous.  Any write() or metadata operation will sync the fs for
2200  * us.
2201  *
2202  */
2203 static int __ext4_normal_writepage(struct page *page,
2204                                 struct writeback_control *wbc)
2205 {
2206         struct inode *inode = page->mapping->host;
2207
2208         if (test_opt(inode->i_sb, NOBH))
2209                 return nobh_writepage(page, ext4_get_block, wbc);
2210         else
2211                 return block_write_full_page(page, ext4_get_block, wbc);
2212 }
2213
2214
2215 static int ext4_normal_writepage(struct page *page,
2216                                 struct writeback_control *wbc)
2217 {
2218         struct inode *inode = page->mapping->host;
2219         loff_t size = i_size_read(inode);
2220         loff_t len;
2221
2222         J_ASSERT(PageLocked(page));
2223         J_ASSERT(page_has_buffers(page));
2224         if (page->index == size >> PAGE_CACHE_SHIFT)
2225                 len = size & ~PAGE_CACHE_MASK;
2226         else
2227                 len = PAGE_CACHE_SIZE;
2228         BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2229                                  ext4_bh_unmapped_or_delay));
2230
2231         if (!ext4_journal_current_handle())
2232                 return __ext4_normal_writepage(page, wbc);
2233
2234         redirty_page_for_writepage(wbc, page);
2235         unlock_page(page);
2236         return 0;
2237 }
2238
2239 static int __ext4_journalled_writepage(struct page *page,
2240                                 struct writeback_control *wbc)
2241 {
2242         struct address_space *mapping = page->mapping;
2243         struct inode *inode = mapping->host;
2244         struct buffer_head *page_bufs;
2245         handle_t *handle = NULL;
2246         int ret = 0;
2247         int err;
2248
2249         ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
2250         if (ret != 0)
2251                 goto out_unlock;
2252
2253         page_bufs = page_buffers(page);
2254         walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
2255                                                                 bget_one);
2256         /* As soon as we unlock the page, it can go away, but we have
2257          * references to buffers so we are safe */
2258         unlock_page(page);
2259
2260         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
2261         if (IS_ERR(handle)) {
2262                 ret = PTR_ERR(handle);
2263                 goto out;
2264         }
2265
2266         ret = walk_page_buffers(handle, page_bufs, 0,
2267                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
2268
2269         err = walk_page_buffers(handle, page_bufs, 0,
2270                                 PAGE_CACHE_SIZE, NULL, write_end_fn);
2271         if (ret == 0)
2272                 ret = err;
2273         err = ext4_journal_stop(handle);
2274         if (!ret)
2275                 ret = err;
2276
2277         walk_page_buffers(handle, page_bufs, 0,
2278                                 PAGE_CACHE_SIZE, NULL, bput_one);
2279         EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
2280         goto out;
2281
2282 out_unlock:
2283         unlock_page(page);
2284 out:
2285         return ret;
2286 }
2287
2288 static int ext4_journalled_writepage(struct page *page,
2289                                 struct writeback_control *wbc)
2290 {
2291         struct inode *inode = page->mapping->host;
2292         loff_t size = i_size_read(inode);
2293         loff_t len;
2294
2295         J_ASSERT(PageLocked(page));
2296         J_ASSERT(page_has_buffers(page));
2297         if (page->index == size >> PAGE_CACHE_SHIFT)
2298                 len = size & ~PAGE_CACHE_MASK;
2299         else
2300                 len = PAGE_CACHE_SIZE;
2301         BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2302                                  ext4_bh_unmapped_or_delay));
2303
2304         if (ext4_journal_current_handle())
2305                 goto no_write;
2306
2307         if (PageChecked(page)) {
2308                 /*
2309                  * It's mmapped pagecache.  Add buffers and journal it.  There
2310                  * doesn't seem much point in redirtying the page here.
2311                  */
2312                 ClearPageChecked(page);
2313                 return __ext4_journalled_writepage(page, wbc);
2314         } else {
2315                 /*
2316                  * It may be a page full of checkpoint-mode buffers.  We don't
2317                  * really know unless we go poke around in the buffer_heads.
2318                  * But block_write_full_page will do the right thing.
2319                  */
2320                 return block_write_full_page(page, ext4_get_block, wbc);
2321         }
2322 no_write:
2323         redirty_page_for_writepage(wbc, page);
2324         unlock_page(page);
2325         return 0;
2326 }
2327
2328 static int ext4_readpage(struct file *file, struct page *page)
2329 {
2330         return mpage_readpage(page, ext4_get_block);
2331 }
2332
2333 static int
2334 ext4_readpages(struct file *file, struct address_space *mapping,
2335                 struct list_head *pages, unsigned nr_pages)
2336 {
2337         return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2338 }
2339
2340 static void ext4_invalidatepage(struct page *page, unsigned long offset)
2341 {
2342         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2343
2344         /*
2345          * If it's a full truncate we just forget about the pending dirtying
2346          */
2347         if (offset == 0)
2348                 ClearPageChecked(page);
2349
2350         jbd2_journal_invalidatepage(journal, page, offset);
2351 }
2352
2353 static int ext4_releasepage(struct page *page, gfp_t wait)
2354 {
2355         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2356
2357         WARN_ON(PageChecked(page));
2358         if (!page_has_buffers(page))
2359                 return 0;
2360         return jbd2_journal_try_to_free_buffers(journal, page, wait);
2361 }
2362
2363 /*
2364  * If the O_DIRECT write will extend the file then add this inode to the
2365  * orphan list.  So recovery will truncate it back to the original size
2366  * if the machine crashes during the write.
2367  *
2368  * If the O_DIRECT write is instantiating holes inside i_size and the machine
2369  * crashes then stale disk data _may_ be exposed inside the file. But current
2370  * VFS code falls back into buffered path in that case so we are safe.
2371  */
2372 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
2373                         const struct iovec *iov, loff_t offset,
2374                         unsigned long nr_segs)
2375 {
2376         struct file *file = iocb->ki_filp;
2377         struct inode *inode = file->f_mapping->host;
2378         struct ext4_inode_info *ei = EXT4_I(inode);
2379         handle_t *handle;
2380         ssize_t ret;
2381         int orphan = 0;
2382         size_t count = iov_length(iov, nr_segs);
2383
2384         if (rw == WRITE) {
2385                 loff_t final_size = offset + count;
2386
2387                 if (final_size > inode->i_size) {
2388                         /* Credits for sb + inode write */
2389                         handle = ext4_journal_start(inode, 2);
2390                         if (IS_ERR(handle)) {
2391                                 ret = PTR_ERR(handle);
2392                                 goto out;
2393                         }
2394                         ret = ext4_orphan_add(handle, inode);
2395                         if (ret) {
2396                                 ext4_journal_stop(handle);
2397                                 goto out;
2398                         }
2399                         orphan = 1;
2400                         ei->i_disksize = inode->i_size;
2401                         ext4_journal_stop(handle);
2402                 }
2403         }
2404
2405         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2406                                  offset, nr_segs,
2407                                  ext4_get_block, NULL);
2408
2409         if (orphan) {
2410                 int err;
2411
2412                 /* Credits for sb + inode write */
2413                 handle = ext4_journal_start(inode, 2);
2414                 if (IS_ERR(handle)) {
2415                         /* This is really bad luck. We've written the data
2416                          * but cannot extend i_size. Bail out and pretend
2417                          * the write failed... */
2418                         ret = PTR_ERR(handle);
2419                         goto out;
2420                 }
2421                 if (inode->i_nlink)
2422                         ext4_orphan_del(handle, inode);
2423                 if (ret > 0) {
2424                         loff_t end = offset + ret;
2425                         if (end > inode->i_size) {
2426                                 ei->i_disksize = end;
2427                                 i_size_write(inode, end);
2428                                 /*
2429                                  * We're going to return a positive `ret'
2430                                  * here due to non-zero-length I/O, so there's
2431                                  * no way of reporting error returns from
2432                                  * ext4_mark_inode_dirty() to userspace.  So
2433                                  * ignore it.
2434                                  */
2435                                 ext4_mark_inode_dirty(handle, inode);
2436                         }
2437                 }
2438                 err = ext4_journal_stop(handle);
2439                 if (ret == 0)
2440                         ret = err;
2441         }
2442 out:
2443         return ret;
2444 }
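
/*
 * Timeline for an extending O_DIRECT write, as implemented above: the
 * inode first goes on the orphan list, then blockdev_direct_IO()
 * writes past i_size.  A crash in between lets recovery truncate the
 * file back to its old size, so no stale blocks become visible; on
 * success i_size and i_disksize are extended and the inode is taken
 * off the orphan list again.
 */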
2445
2446 /*
2447  * Pages can be marked dirty completely asynchronously from ext4's journalling
2448  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
2449  * much here because ->set_page_dirty is called under VFS locks.  The page is
2450  * not necessarily locked.
2451  *
2452  * We cannot just dirty the page and leave attached buffers clean, because the
2453  * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
2454  * or jbddirty because all the journalling code will explode.
2455  *
2456  * So what we do is to mark the page "pending dirty" and next time writepage
2457  * is called, propagate that into the buffers appropriately.
2458  */
2459 static int ext4_journalled_set_page_dirty(struct page *page)
2460 {
2461         SetPageChecked(page);
2462         return __set_page_dirty_nobuffers(page);
2463 }
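
/*
 * Together with ext4_journalled_writepage() above this implements the
 * "pending dirty" protocol: set_page_dirty only marks the page Checked,
 * and the next writepage sees PageChecked, clears it and journals the
 * buffers via __ext4_journalled_writepage().
 */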
2464
2465 static const struct address_space_operations ext4_ordered_aops = {
2466         .readpage       = ext4_readpage,
2467         .readpages      = ext4_readpages,
2468         .writepage      = ext4_normal_writepage,
2469         .sync_page      = block_sync_page,
2470         .write_begin    = ext4_write_begin,
2471         .write_end      = ext4_ordered_write_end,
2472         .bmap           = ext4_bmap,
2473         .invalidatepage = ext4_invalidatepage,
2474         .releasepage    = ext4_releasepage,
2475         .direct_IO      = ext4_direct_IO,
2476         .migratepage    = buffer_migrate_page,
2477 };
2478
2479 static const struct address_space_operations ext4_writeback_aops = {
2480         .readpage       = ext4_readpage,
2481         .readpages      = ext4_readpages,
2482         .writepage      = ext4_normal_writepage,
2483         .sync_page      = block_sync_page,
2484         .write_begin    = ext4_write_begin,
2485         .write_end      = ext4_writeback_write_end,
2486         .bmap           = ext4_bmap,
2487         .invalidatepage = ext4_invalidatepage,
2488         .releasepage    = ext4_releasepage,
2489         .direct_IO      = ext4_direct_IO,
2490         .migratepage    = buffer_migrate_page,
2491 };
2492
2493 static const struct address_space_operations ext4_journalled_aops = {
2494         .readpage       = ext4_readpage,
2495         .readpages      = ext4_readpages,
2496         .writepage      = ext4_journalled_writepage,
2497         .sync_page      = block_sync_page,
2498         .write_begin    = ext4_write_begin,
2499         .write_end      = ext4_journalled_write_end,
2500         .set_page_dirty = ext4_journalled_set_page_dirty,
2501         .bmap           = ext4_bmap,
2502         .invalidatepage = ext4_invalidatepage,
2503         .releasepage    = ext4_releasepage,
2504 };
2505
2506 static const struct address_space_operations ext4_da_aops = {
2507         .readpage       = ext4_readpage,
2508         .readpages      = ext4_readpages,
2509         .writepage      = ext4_da_writepage,
2510         .writepages     = ext4_da_writepages,
2511         .sync_page      = block_sync_page,
2512         .write_begin    = ext4_da_write_begin,
2513         .write_end      = ext4_da_write_end,
2514         .bmap           = ext4_bmap,
2515         .invalidatepage = ext4_da_invalidatepage,
2516         .releasepage    = ext4_releasepage,
2517         .direct_IO      = ext4_direct_IO,
2518         .migratepage    = buffer_migrate_page,
2519 };
2520
2521 void ext4_set_aops(struct inode *inode)
2522 {
2523         if (ext4_should_order_data(inode))
2524                 inode->i_mapping->a_ops = &ext4_ordered_aops;
2525         else if (ext4_should_writeback_data(inode) &&
2526                  test_opt(inode->i_sb, DELALLOC))
2527                 inode->i_mapping->a_ops = &ext4_da_aops;
2528         else if (ext4_should_writeback_data(inode))
2529                 inode->i_mapping->a_ops = &ext4_writeback_aops;
2530         else
2531                 inode->i_mapping->a_ops = &ext4_journalled_aops;
2532 }
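
/*
 * In short: data=ordered gets ext4_ordered_aops; data=writeback gets
 * ext4_da_aops when mounted with delalloc and ext4_writeback_aops
 * otherwise; data=journal gets ext4_journalled_aops.
 */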
2533
2534 /*
2535  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
2536  * up to the end of the block which corresponds to `from'.
2537  * This is required during truncate. We need to physically zero the tail end
2538  * of that block so it doesn't yield old data if the file is later grown.
2539  */
2540 int ext4_block_truncate_page(handle_t *handle,
2541                 struct address_space *mapping, loff_t from)
2542 {
2543         ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
2544         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2545         unsigned blocksize, length, pos;
2546         ext4_lblk_t iblock;
2547         struct inode *inode = mapping->host;
2548         struct buffer_head *bh;
2549         struct page *page;
2550         int err = 0;
2551
2552         page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
2553         if (!page)
2554                 return -EINVAL;
2555
2556         blocksize = inode->i_sb->s_blocksize;
2557         length = blocksize - (offset & (blocksize - 1));
2558         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
2559
2560         /*
2561          * For the "nobh" option, we can only work if we don't need to
2562          * read in the page - otherwise we create buffers to do the IO.
2563          */
2564         if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
2565              ext4_should_writeback_data(inode) && PageUptodate(page)) {
2566                 zero_user(page, offset, length);
2567                 set_page_dirty(page);
2568                 goto unlock;
2569         }
2570
2571         if (!page_has_buffers(page))
2572                 create_empty_buffers(page, blocksize, 0);
2573
2574         /* Find the buffer that contains "offset" */
2575         bh = page_buffers(page);
2576         pos = blocksize;
2577         while (offset >= pos) {
2578                 bh = bh->b_this_page;
2579                 iblock++;
2580                 pos += blocksize;
2581         }
2582
2583         err = 0;
2584         if (buffer_freed(bh)) {
2585                 BUFFER_TRACE(bh, "freed: skip");
2586                 goto unlock;
2587         }
2588
2589         if (!buffer_mapped(bh)) {
2590                 BUFFER_TRACE(bh, "unmapped");
2591                 ext4_get_block(inode, iblock, bh, 0);
2592                 /* unmapped? It's a hole - nothing to do */
2593                 if (!buffer_mapped(bh)) {
2594                         BUFFER_TRACE(bh, "still unmapped");
2595                         goto unlock;
2596                 }
2597         }
2598
2599         /* Ok, it's mapped. Make sure it's up-to-date */
2600         if (PageUptodate(page))
2601                 set_buffer_uptodate(bh);
2602
2603         if (!buffer_uptodate(bh)) {
2604                 err = -EIO;
2605                 ll_rw_block(READ, 1, &bh);
2606                 wait_on_buffer(bh);
2607                 /* Uhhuh. Read error. Complain and punt. */
2608                 if (!buffer_uptodate(bh))
2609                         goto unlock;
2610         }
2611
2612         if (ext4_should_journal_data(inode)) {
2613                 BUFFER_TRACE(bh, "get write access");
2614                 err = ext4_journal_get_write_access(handle, bh);
2615                 if (err)
2616                         goto unlock;
2617         }
2618
2619         zero_user(page, offset, length);
2620
2621         BUFFER_TRACE(bh, "zeroed end of block");
2622
2623         err = 0;
2624         if (ext4_should_journal_data(inode)) {
2625                 err = ext4_journal_dirty_metadata(handle, bh);
2626         } else {
2627                 if (ext4_should_order_data(inode))
2628                         err = ext4_jbd2_file_inode(handle, inode);
2629                 mark_buffer_dirty(bh);
2630         }
2631
2632 unlock:
2633         unlock_page(page);
2634         page_cache_release(page);
2635         return err;
2636 }
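
/*
 * Example with 4K pages and 4K blocks: truncating to from = 10000
 * gives offset = 10000 & 4095 = 1808 on page index 2, and
 * length = 4096 - 1808 = 2288, so bytes 1808..4095 of that block are
 * zeroed out.
 */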
2637
2638 /*
2639  * Probably it should be a library function... search for first non-zero word
2640  * or memcmp with zero_page, whatever is better for particular architecture.
2641  * Linus?
2642  */
2643 static inline int all_zeroes(__le32 *p, __le32 *q)
2644 {
2645         while (p < q)
2646                 if (*p++)
2647                         return 0;
2648         return 1;
2649 }
2650
2651 /**
2652  *      ext4_find_shared - find the indirect blocks for partial truncation.
2653  *      @inode:   inode in question
2654  *      @depth:   depth of the affected branch
2655  *      @offsets: offsets of pointers in that branch (see ext4_block_to_path)
2656  *      @chain:   place to store the pointers to partial indirect blocks
2657  *      @top:     place to the (detached) top of branch
2658  *
2659  *      This is a helper function used by ext4_truncate().
2660  *
2661  *      When we do truncate() we may have to clean the ends of several
2662  *      indirect blocks but leave the blocks themselves alive. Block is
2663  *      partially truncated if some data below the new i_size is referred to
2664  *      from it (and it is on the path to the first completely truncated
2665  *      data block, indeed).  We have to free the top of that path along
2666  *      with everything to the right of the path. Since no allocation
2667  *      past the truncation point is possible until ext4_truncate()
2668  *      finishes, we may safely do the latter, but top of branch may
2669  *      require special attention - pageout below the truncation point
2670  *      might try to populate it.
2671  *
2672  *      We atomically detach the top of branch from the tree, store the
2673  *      block number of its root in *@top, pointers to buffer_heads of
2674  *      partially truncated blocks - in @chain[].bh and pointers to
2675  *      their last elements that should not be removed - in
2676  *      @chain[].p. Return value is the pointer to last filled element
2677  *      of @chain.
2678  *
2679  *      The work of actually freeing the subtrees is left to the caller:
2680  *              a) free the subtree starting from *@top
2681  *              b) free the subtrees whose roots are stored in
2682  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
2683  *              c) free the subtrees growing from the inode past the @chain[0].
2684  *                      (no partially truncated stuff there).  */
2685
2686 static Indirect *ext4_find_shared(struct inode *inode, int depth,
2687                         ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
2688 {
2689         Indirect *partial, *p;
2690         int k, err;
2691
2692         *top = 0;
2693         /* Make k index the deepest non-null offset + 1 */
2694         for (k = depth; k > 1 && !offsets[k-1]; k--)
2695                 ;
2696         partial = ext4_get_branch(inode, k, offsets, chain, &err);
2697         /* Writer: pointers */
2698         if (!partial)
2699                 partial = chain + k-1;
2700         /*
2701          * If the branch acquired continuation since we've looked at it -
2702          * fine, it should all survive and (new) top doesn't belong to us.
2703          */
2704         if (!partial->key && *partial->p)
2705                 /* Writer: end */
2706                 goto no_top;
2707         for (p = partial; p > chain && all_zeroes((__le32 *)p->bh->b_data, p->p); p--)
2708                 ;
2709         /*
2710          * OK, we've found the last block that must survive. The rest of our
2711          * branch should be detached before unlocking. However, if that rest
2712          * of branch is all ours and does not grow immediately from the inode
2713          * it's easier to cheat and just decrement partial->p.
2714          */
2715         if (p == chain + k - 1 && p > chain) {
2716                 p->p--;
2717         } else {
2718                 *top = *p->p;
2719                 /* Nope, don't do this in ext4.  Must leave the tree intact */
2720 #if 0
2721                 *p->p = 0;
2722 #endif
2723         }
2724         /* Writer: end */
2725
2726         while (partial > p) {
2727                 brelse(partial->bh);
2728                 partial--;
2729         }
2730 no_top:
2731         return partial;
2732 }
2733
2734 /*
2735  * Zero a number of block pointers in either an inode or an indirect block.
2736  * If we restart the transaction we must again get write access to the
2737  * indirect block for further modification.
2738  *
2739  * We release `count' blocks on disk, but (last - first) may be greater
2740  * than `count' because there can be holes in there.
2741  */
2742 static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
2743                 struct buffer_head *bh, ext4_fsblk_t block_to_free,
2744                 unsigned long count, __le32 *first, __le32 *last)
2745 {
2746         __le32 *p;
2747         if (try_to_extend_transaction(handle, inode)) {
2748                 if (bh) {
2749                         BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
2750                         ext4_journal_dirty_metadata(handle, bh);
2751                 }
2752                 ext4_mark_inode_dirty(handle, inode);
2753                 ext4_journal_test_restart(handle, inode);
2754                 if (bh) {
2755                         BUFFER_TRACE(bh, "retaking write access");
2756                         ext4_journal_get_write_access(handle, bh);
2757                 }
2758         }
2759
2760         /*
2761          * Any buffers which are on the journal will be in memory. We find
2762          * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
2763          * on them.  We've already detached each block from the file, so
2764          * bforget() in jbd2_journal_forget() should be safe.
2765          *
2766          * AKPM: turn on bforget in jbd2_journal_forget()!!!
2767          */
2768         for (p = first; p < last; p++) {
2769                 u32 nr = le32_to_cpu(*p);
2770                 if (nr) {
2771                         struct buffer_head *tbh;
2772
2773                         *p = 0;
2774                         tbh = sb_find_get_block(inode->i_sb, nr);
2775                         ext4_forget(handle, 0, inode, tbh, nr);
2776                 }
2777         }
2778
2779         ext4_free_blocks(handle, inode, block_to_free, count, 0);
2780 }
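
/*
 * The extend-or-restart pattern just used recurs throughout the
 * truncate path, so it is worth spelling out.  A minimal sketch,
 * using the same helpers as ext4_clear_blocks() above:
 *
 *      if (try_to_extend_transaction(handle, inode)) {
 *              // low on credits and could not extend: flush our state
 *              ext4_journal_dirty_metadata(handle, bh);
 *              ext4_mark_inode_dirty(handle, inode);
 *              // swap in a fresh handle
 *              ext4_journal_test_restart(handle, inode);
 *              // the write access belonged to the old handle: retake it
 *              ext4_journal_get_write_access(handle, bh);
 *      }
 */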
2781
2782 /**
2783  * ext4_free_data - free a list of data blocks
2784  * @handle:     handle for this transaction
2785  * @inode:      inode we are dealing with
2786  * @this_bh:    indirect buffer_head which contains *@first and *@last
2787  * @first:      array of block numbers
2788  * @last:       points immediately past the end of array
2789  *
2790  * We are freeing all blocks referred to from that array (numbers are stored as
2791  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2792  *
2793  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
2794  * blocks are contiguous then releasing them at one time will only affect one
2795  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2796  * actually use a lot of journal space.
2797  *
2798  * @this_bh will be %NULL if @first and @last point into the inode's direct
2799  * block pointers.
2800  */
2801 static void ext4_free_data(handle_t *handle, struct inode *inode,
2802                            struct buffer_head *this_bh,
2803                            __le32 *first, __le32 *last)
2804 {
2805         ext4_fsblk_t block_to_free = 0;    /* Starting block # of a run */
2806         unsigned long count = 0;            /* Number of blocks in the run */
2807         __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
2808                                                corresponding to
2809                                                block_to_free */
2810         ext4_fsblk_t nr;                    /* Current block # */
2811         __le32 *p;                          /* Pointer into inode/ind
2812                                                for current block */
2813         int err;
2814
2815         if (this_bh) {                          /* For indirect block */
2816                 BUFFER_TRACE(this_bh, "get_write_access");
2817                 err = ext4_journal_get_write_access(handle, this_bh);
2818                 /* Important: if we can't update the indirect pointers
2819                  * to the blocks, we can't free them. */
2820                 if (err)
2821                         return;
2822         }
2823
2824         for (p = first; p < last; p++) {
2825                 nr = le32_to_cpu(*p);
2826                 if (nr) {
2827                         /* accumulate blocks to free if they're contiguous */
2828                         if (count == 0) {
2829                                 block_to_free = nr;
2830                                 block_to_free_p = p;
2831                                 count = 1;
2832                         } else if (nr == block_to_free + count) {
2833                                 count++;
2834                         } else {
2835                                 ext4_clear_blocks(handle, inode, this_bh,
2836                                                   block_to_free,
2837                                                   count, block_to_free_p, p);
2838                                 block_to_free = nr;
2839                                 block_to_free_p = p;
2840                                 count = 1;
2841                         }
2842                 }
2843         }
2844
2845         if (count > 0)
2846                 ext4_clear_blocks(handle, inode, this_bh, block_to_free,
2847                                   count, block_to_free_p, p);
2848
2849         if (this_bh) {
2850                 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
2851
2852                 /*
2853                  * The buffer head should have an attached journal head at this
2854                  * point. However, if the data is corrupted and an indirect
2855                  * block pointed to itself, it would have been detached when
2856                  * the block was cleared. Check for this instead of OOPSing.
2857                  */
2858                 if (bh2jh(this_bh))
2859                         ext4_journal_dirty_metadata(handle, this_bh);
2860                 else
2861                         ext4_error(inode->i_sb, __func__,
2862                                    "circular indirect block detected, "
2863                                    "inode=%lu, block=%llu",
2864                                    inode->i_ino,
2865                                    (unsigned long long) this_bh->b_blocknr);
2866         }
2867 }
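
/*
 * A worked example of the run accumulation above: suppose the pointer
 * array holds blocks 100, 101, 102, 0, 200.  The loop grows one run
 * (block_to_free = 100, count = 3), skips the hole, and on reaching
 * the 200 entry issues
 *
 *      ext4_clear_blocks(handle, inode, this_bh, 100, 3,
 *                        block_to_free_p, p);
 *
 * where p points at the 200 entry, so (last - first) exceeds count --
 * exactly the hole case noted above ext4_clear_blocks().  The final
 * one-block run (200) is flushed by the call after the loop, and
 * freeing 100-102 in a single call touches one bitmap block rather
 * than three.
 */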
2868
2869 /**
2870  *      ext4_free_branches - free an array of branches
2871  *      @handle: JBD handle for this transaction
2872  *      @inode: inode we are dealing with
2873  *      @parent_bh: the buffer_head which contains *@first and *@last
2874  *      @first: array of block numbers
2875  *      @last:  pointer immediately past the end of array
2876  *      @depth: depth of the branches to free
2877  *
2878  *      We are freeing all blocks referred to from these branches (numbers are
2879  *      stored as little-endian 32-bit) and updating @inode->i_blocks
2880  *      appropriately.
2881  */
2882 static void ext4_free_branches(handle_t *handle, struct inode *inode,
2883                                struct buffer_head *parent_bh,
2884                                __le32 *first, __le32 *last, int depth)
2885 {
2886         ext4_fsblk_t nr;
2887         __le32 *p;
2888
2889         if (is_handle_aborted(handle))
2890                 return;
2891
2892         if (depth--) {
2893                 struct buffer_head *bh;
2894                 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2895                 p = last;
2896                 while (--p >= first) {
2897                         nr = le32_to_cpu(*p);
2898                         if (!nr)
2899                                 continue;               /* A hole */
2900
2901                         /* Go read the buffer for the next level down */
2902                         bh = sb_bread(inode->i_sb, nr);
2903
2904                         /*
2905                          * A read failure? Report error and clear slot
2906                          * (should be rare).
2907                          */
2908                         if (!bh) {
2909                                 ext4_error(inode->i_sb, "ext4_free_branches",
2910                                            "Read failure, inode=%lu, block=%llu",
2911                                            inode->i_ino, nr);
2912                                 continue;
2913                         }
2914
2915                         /* This zaps the entire block.  Bottom up. */
2916                         BUFFER_TRACE(bh, "free child branches");
2917                         ext4_free_branches(handle, inode, bh,
2918                                            (__le32*)bh->b_data,
2919                                            (__le32*)bh->b_data + addr_per_block,
2920                                            depth);
2921
2922                         /*
2923                          * We've probably journalled the indirect block several
2924                          * times during the truncate.  But it's no longer
2925                          * needed and we now drop it from the transaction via
2926                          * jbd2_journal_revoke().
2927                          *
2928                          * That's easy if it's exclusively part of this
2929                          * transaction.  But if it's part of the committing
2930                          * transaction then jbd2_journal_forget() will simply
2931                          * brelse() it.  That means that if the underlying
2932                          * block is reallocated in ext4_get_block(),
2933                          * unmap_underlying_metadata() will find this block
2934                          * and will try to get rid of it.  damn, damn.
2935                          *
2936                          * If this block has already been committed to the
2937                          * journal, a revoke record will be written.  And
2938                          * revoke records must be emitted *before* clearing
2939                          * this block's bit in the bitmaps.
2940                          */
2941                         ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
2942
2943                         /*
2944                          * Everything below this pointer has been
2945                          * released.  Now let this top-of-subtree go.
2946                          *
2947                          * We want the freeing of this indirect block to be
2948                          * atomic in the journal with the updating of the
2949                          * bitmap block which owns it.  So make some room in
2950                          * the journal.
2951                          *
2952                          * We zero the parent pointer *after* freeing its
2953                          * pointee in the bitmaps, so if extend_transaction()
2954                          * for some reason fails to put the bitmap changes and
2955                          * the release into the same transaction, recovery
2956                          * will merely complain about releasing a free block,
2957                          * rather than leaking blocks.
2958                          */
2959                         if (is_handle_aborted(handle))
2960                                 return;
2961                         if (try_to_extend_transaction(handle, inode)) {
2962                                 ext4_mark_inode_dirty(handle, inode);
2963                                 ext4_journal_test_restart(handle, inode);
2964                         }
2965
2966                         ext4_free_blocks(handle, inode, nr, 1, 1);
2967
2968                         if (parent_bh) {
2969                                 /*
2970                                  * The block which we have just freed is
2971                                  * pointed to by an indirect block: journal it
2972                                  */
2973                                 BUFFER_TRACE(parent_bh, "get_write_access");
2974                                 if (!ext4_journal_get_write_access(handle,
2975                                                                    parent_bh)){
2976                                         *p = 0;
2977                                         BUFFER_TRACE(parent_bh,
2978                                         "call ext4_journal_dirty_metadata");
2979                                         ext4_journal_dirty_metadata(handle,
2980                                                                     parent_bh);
2981                                 }
2982                         }
2983                 }
2984         } else {
2985                 /* We have reached the bottom of the tree. */
2986                 BUFFER_TRACE(parent_bh, "free data blocks");
2987                 ext4_free_data(handle, inode, parent_bh, first, last);
2988         }
2989 }
2990
2991 int ext4_can_truncate(struct inode *inode)
2992 {
2993         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2994                 return 0;
2995         if (S_ISREG(inode->i_mode))
2996                 return 1;
2997         if (S_ISDIR(inode->i_mode))
2998                 return 1;
2999         if (S_ISLNK(inode->i_mode))
3000                 return !ext4_inode_is_fast_symlink(inode);
3001         return 0;
3002 }
3003
3004 /*
3005  * ext4_truncate()
3006  *
3007  * We block out ext4_get_block() block instantiations across the entire
3008  * transaction, and VFS/VM ensures that ext4_truncate() cannot run
3009  * simultaneously on behalf of the same inode.
3010  *
3011  * As we work through the truncate and commit bits of it to the journal there
3012  * is one core guiding principle: the file's tree must always be consistent on
3013  * disk.  We must be able to restart the truncate after a crash.
3014  *
3015  * The file's tree may be transiently inconsistent in memory (although it
3016  * probably isn't), but whenever we close off and commit a journal transaction,
3017  * the contents of (the filesystem + the journal) must be consistent and
3018  * restartable.  It's pretty simple, really: bottom up, right to left (although
3019  * left-to-right works OK too).
3020  *
3021  * Note that at recovery time, journal replay occurs *before* the restart of
3022  * truncate against the orphan inode list.
3023  *
3024  * The committed inode has the new, desired i_size (which is the same as
3025  * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
3026  * that this inode's truncate did not complete and it will again call
3027  * ext4_truncate() to have another go.  So there will be instantiated blocks
3028  * to the right of the truncation point in a crashed ext4 filesystem.  But
3029  * that's fine - as long as they are linked from the inode, the post-crash
3030  * ext4_truncate() run will find them and release them.
3031  */
3032 void ext4_truncate(struct inode *inode)
3033 {
3034         handle_t *handle;
3035         struct ext4_inode_info *ei = EXT4_I(inode);
3036         __le32 *i_data = ei->i_data;
3037         int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
3038         struct address_space *mapping = inode->i_mapping;
3039         ext4_lblk_t offsets[4];
3040         Indirect chain[4];
3041         Indirect *partial;
3042         __le32 nr = 0;
3043         int n;
3044         ext4_lblk_t last_block;
3045         unsigned blocksize = inode->i_sb->s_blocksize;
3046
3047         if (!ext4_can_truncate(inode))
3048                 return;
3049
3050         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
3051                 ext4_ext_truncate(inode);
3052                 return;
3053         }
3054
3055         handle = start_transaction(inode);
3056         if (IS_ERR(handle))
3057                 return;         /* AKPM: return what? */
3058
3059         last_block = (inode->i_size + blocksize-1)
3060                                         >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
3061
3062         if (inode->i_size & (blocksize - 1))
3063                 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
3064                         goto out_stop;
3065
3066         n = ext4_block_to_path(inode, last_block, offsets, NULL);
3067         if (n == 0)
3068                 goto out_stop;  /* error */
3069
3070         /*
3071          * OK.  This truncate is going to happen.  We add the inode to the
3072          * orphan list, so that if this truncate spans multiple transactions,
3073          * and we crash, we will resume the truncate when the filesystem
3074          * recovers.  It also marks the inode dirty, to catch the new size.
3075          *
3076          * Implication: the file must always be in a sane, consistent
3077          * truncatable state while each transaction commits.
3078          */
3079         if (ext4_orphan_add(handle, inode))
3080                 goto out_stop;
3081
3082         /*
3083          * The orphan list entry will now protect us from any crash which
3084          * occurs before the truncate completes, so it is now safe to propagate
3085          * the new, shorter inode size (held for now in i_size) into the
3086          * on-disk inode. We do this via i_disksize, which is the value which
3087          * ext4 *really* writes onto the disk inode.
3088          */
3089         ei->i_disksize = inode->i_size;
3090
3091         /*
3092          * From here we block out all ext4_get_block() callers who want to
3093          * modify the block allocation tree.
3094          */
3095         down_write(&ei->i_data_sem);
3096
3097         if (n == 1) {           /* direct blocks */
3098                 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
3099                                i_data + EXT4_NDIR_BLOCKS);
3100                 goto do_indirects;
3101         }
3102
3103         partial = ext4_find_shared(inode, n, offsets, chain, &nr);
3104         /* Kill the top of shared branch (not detached) */
3105         if (nr) {
3106                 if (partial == chain) {
3107                         /* Shared branch grows from the inode */
3108                         ext4_free_branches(handle, inode, NULL,
3109                                            &nr, &nr+1, (chain+n-1) - partial);
3110                         *partial->p = 0;
3111                         /*
3112                          * We mark the inode dirty prior to restart,
3113                          * and prior to stop.  No need for it here.
3114                          */
3115                 } else {
3116                         /* Shared branch grows from an indirect block */
3117                         BUFFER_TRACE(partial->bh, "get_write_access");
3118                         ext4_free_branches(handle, inode, partial->bh,
3119                                         partial->p,
3120                                         partial->p+1, (chain+n-1) - partial);
3121                 }
3122         }
3123         /* Clear the ends of indirect blocks on the shared branch */
3124         while (partial > chain) {
3125                 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
3126                                    (__le32*)partial->bh->b_data+addr_per_block,
3127                                    (chain+n-1) - partial);
3128                 BUFFER_TRACE(partial->bh, "call brelse");
3129                 brelse (partial->bh);
3130                 partial--;
3131         }
3132 do_indirects:
3133         /* Kill the remaining (whole) subtrees */
3134         switch (offsets[0]) {
3135         default:
3136                 nr = i_data[EXT4_IND_BLOCK];
3137                 if (nr) {
3138                         ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
3139                         i_data[EXT4_IND_BLOCK] = 0;
3140                 }
3141         case EXT4_IND_BLOCK:
3142                 nr = i_data[EXT4_DIND_BLOCK];
3143                 if (nr) {
3144                         ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
3145                         i_data[EXT4_DIND_BLOCK] = 0;
3146                 }
3147         case EXT4_DIND_BLOCK:
3148                 nr = i_data[EXT4_TIND_BLOCK];
3149                 if (nr) {
3150                         ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
3151                         i_data[EXT4_TIND_BLOCK] = 0;
3152                 }
3153         case EXT4_TIND_BLOCK:
3154                 ;
3155         }
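
        /*
         * The fall-through above is deliberate: offsets[0] names the
         * first tree still holding blocks past the truncation point.
         * A cut inside the direct blocks takes the default arm and
         * kills all three indirect trees; a cut inside the single-
         * indirect range enters at EXT4_IND_BLOCK (that tree itself
         * was already trimmed by the shared-branch code above) and
         * kills only the double- and triple-indirect trees; and so on.
         */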
3156
3157         ext4_discard_reservation(inode);
3158
3159         up_write(&ei->i_data_sem);
3160         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3161         ext4_mark_inode_dirty(handle, inode);
3162
3163         /*
3164          * In a multi-transaction truncate, we only make the final transaction
3165          * synchronous
3166          */
3167         if (IS_SYNC(inode))
3168                 handle->h_sync = 1;
3169 out_stop:
3170         /*
3171          * If this was a simple ftruncate(), and the file will remain alive
3172          * then we need to clear up the orphan record which we created above.
3173          * However, if this was a real unlink then we were called by
3174          * ext4_delete_inode(), and we allow that function to clean up the
3175          * orphan info for us.
3176          */
3177         if (inode->i_nlink)
3178                 ext4_orphan_del(handle, inode);
3179
3180         ext4_journal_stop(handle);
3181 }
3182
3183 static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3184                 unsigned long ino, struct ext4_iloc *iloc)
3185 {
3186         ext4_group_t block_group;
3187         unsigned long offset;
3188         ext4_fsblk_t block;
3189         struct ext4_group_desc *gdp;
3190
3191         if (!ext4_valid_inum(sb, ino)) {
3192                 /*
3193                  * This error is already checked for in namei.c unless we are
3194                  * looking at an NFS filehandle, in which case no error
3195                  * report is needed
3196                  */
3197                 return 0;
3198         }
3199
3200         block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
3201         gdp = ext4_get_group_desc(sb, block_group, NULL);
3202         if (!gdp)
3203                 return 0;
3204
3205         /*
3206          * Figure out the offset within the block group inode table
3207          */
3208         offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
3209                 EXT4_INODE_SIZE(sb);
3210         block = ext4_inode_table(sb, gdp) +
3211                 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
3212
3213         iloc->block_group = block_group;
3214         iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
3215         return block;
3216 }
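
/*
 * A worked example, with illustrative parameters: 32768 inodes per
 * group, 256-byte on-disk inodes, 4KiB blocks.  For ino = 40000:
 *
 *      block_group  = 39999 / 32768 = 1
 *      offset       = (39999 % 32768) * 256 = 1851136
 *      block        = ext4_inode_table(sb, gdp) + (1851136 >> 12)
 *                   = 451 blocks into group 1's inode table
 *      iloc->offset = 1851136 & 4095 = 3840
 *
 * i.e. the raw inode starts 3840 bytes into that block.
 */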
3217
3218 /*
3219  * ext4_get_inode_loc returns with an extra refcount against the inode's
3220  * underlying buffer_head on success. If 'in_mem' is true, we have all
3221  * data in memory that is needed to recreate the on-disk version of this
3222  * inode.
3223  */
3224 static int __ext4_get_inode_loc(struct inode *inode,
3225                                 struct ext4_iloc *iloc, int in_mem)
3226 {
3227         ext4_fsblk_t block;
3228         struct buffer_head *bh;
3229
3230         block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
3231         if (!block)
3232                 return -EIO;
3233
3234         bh = sb_getblk(inode->i_sb, block);
3235         if (!bh) {
3236                 ext4_error (inode->i_sb, "ext4_get_inode_loc",
3237                                 "unable to read inode block - "
3238                                 "inode=%lu, block=%llu",
3239                                  inode->i_ino, block);
3240                 return -EIO;
3241         }
3242         if (!buffer_uptodate(bh)) {
3243                 lock_buffer(bh);
3244                 if (buffer_uptodate(bh)) {
3245                         /* someone brought it uptodate while we waited */
3246                         unlock_buffer(bh);
3247                         goto has_buffer;
3248                 }
3249
3250                 /*
3251                  * If we have all information of the inode in memory and this
3252                  * is the only valid inode in the block, we need not read the
3253                  * block.
3254                  */
3255                 if (in_mem) {
3256                         struct buffer_head *bitmap_bh;
3257                         struct ext4_group_desc *desc;
3258                         int inodes_per_buffer;
3259                         int inode_offset, i;
3260                         ext4_group_t block_group;
3261                         int start;
3262
3263                         block_group = (inode->i_ino - 1) /
3264                                         EXT4_INODES_PER_GROUP(inode->i_sb);
3265                         inodes_per_buffer = bh->b_size /
3266                                 EXT4_INODE_SIZE(inode->i_sb);
3267                         inode_offset = ((inode->i_ino - 1) %
3268                                         EXT4_INODES_PER_GROUP(inode->i_sb));
3269                         start = inode_offset & ~(inodes_per_buffer - 1);
3270
3271                         /* Is the inode bitmap in cache? */
3272                         desc = ext4_get_group_desc(inode->i_sb,
3273                                                 block_group, NULL);
3274                         if (!desc)
3275                                 goto make_io;
3276
3277                         bitmap_bh = sb_getblk(inode->i_sb,
3278                                 ext4_inode_bitmap(inode->i_sb, desc));
3279                         if (!bitmap_bh)
3280                                 goto make_io;
3281
3282                         /*
3283                          * If the inode bitmap isn't in cache then the
3284                          * optimisation may end up performing two reads instead
3285                          * of one, so skip it.
3286                          */
3287                         if (!buffer_uptodate(bitmap_bh)) {
3288                                 brelse(bitmap_bh);
3289                                 goto make_io;
3290                         }
3291                         for (i = start; i < start + inodes_per_buffer; i++) {
3292                                 if (i == inode_offset)
3293                                         continue;
3294                                 if (ext4_test_bit(i, bitmap_bh->b_data))
3295                                         break;
3296                         }
3297                         brelse(bitmap_bh);
3298                         if (i == start + inodes_per_buffer) {
3299                                 /* all other inodes are free, so skip I/O */
3300                                 memset(bh->b_data, 0, bh->b_size);
3301                                 set_buffer_uptodate(bh);
3302                                 unlock_buffer(bh);
3303                                 goto has_buffer;
3304                         }
3305                 }
3306
3307 make_io:
3308                 /*
3309                  * There are other valid inodes in the buffer, this inode
3310                  * has in-inode xattrs, or we don't have this inode in memory.
3311                  * Read the block from disk.
3312                  */
3313                 get_bh(bh);
3314                 bh->b_end_io = end_buffer_read_sync;
3315                 submit_bh(READ_META, bh);
3316                 wait_on_buffer(bh);
3317                 if (!buffer_uptodate(bh)) {
3318                         ext4_error(inode->i_sb, "ext4_get_inode_loc",
3319                                         "unable to read inode block - "
3320                                         "inode=%lu, block=%llu",
3321                                         inode->i_ino, block);
3322                         brelse(bh);
3323                         return -EIO;
3324                 }
3325         }
3326 has_buffer:
3327         iloc->bh = bh;
3328         return 0;
3329 }
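
/*
 * To make the skip-the-read optimisation concrete (illustrative
 * numbers): with a 4KiB buffer and 256-byte inodes, inodes_per_buffer
 * is 16, and `start' masks inode_offset down to the 16-inode window
 * sharing our buffer.  If the inode bitmap shows every other inode in
 * that window unallocated, the block holds no live data besides the
 * inode we already have in memory, so it is safe to synthesize the
 * buffer with memset() instead of reading it from disk.
 */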
3330
3331 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
3332 {
3333         /* We have all inode data except xattrs in memory here. */
3334         return __ext4_get_inode_loc(inode, iloc,
3335                 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
3336 }
3337
3338 void ext4_set_inode_flags(struct inode *inode)
3339 {
3340         unsigned int flags = EXT4_I(inode)->i_flags;
3341
3342         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
3343         if (flags & EXT4_SYNC_FL)
3344                 inode->i_flags |= S_SYNC;
3345         if (flags & EXT4_APPEND_FL)
3346                 inode->i_flags |= S_APPEND;
3347         if (flags & EXT4_IMMUTABLE_FL)
3348                 inode->i_flags |= S_IMMUTABLE;
3349         if (flags & EXT4_NOATIME_FL)
3350                 inode->i_flags |= S_NOATIME;
3351         if (flags & EXT4_DIRSYNC_FL)
3352                 inode->i_flags |= S_DIRSYNC;
3353 }
3354
3355 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
3356 void ext4_get_inode_flags(struct ext4_inode_info *ei)
3357 {
3358         unsigned int flags = ei->vfs_inode.i_flags;
3359
3360         ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
3361                         EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL);
3362         if (flags & S_SYNC)
3363                 ei->i_flags |= EXT4_SYNC_FL;
3364         if (flags & S_APPEND)
3365                 ei->i_flags |= EXT4_APPEND_FL;
3366         if (flags & S_IMMUTABLE)
3367                 ei->i_flags |= EXT4_IMMUTABLE_FL;
3368         if (flags & S_NOATIME)
3369                 ei->i_flags |= EXT4_NOATIME_FL;
3370         if (flags & S_DIRSYNC)
3371                 ei->i_flags |= EXT4_DIRSYNC_FL;
3372 }
3373 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
3374                                         struct ext4_inode_info *ei)
3375 {
3376         blkcnt_t i_blocks ;
3377         struct inode *inode = &(ei->vfs_inode);
3378         struct super_block *sb = inode->i_sb;
3379
3380         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3381                                 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
3382                 /* we are using combined 48 bit field */
3383                 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
3384                                         le32_to_cpu(raw_inode->i_blocks_lo);
3385                 if (ei->i_flags & EXT4_HUGE_FILE_FL) {
3386                         /* i_blocks is in units of the file system block size */
3387                         return i_blocks  << (inode->i_blkbits - 9);
3388                 } else {
3389                         return i_blocks;
3390                 }
3391         } else {
3392                 return le32_to_cpu(raw_inode->i_blocks_lo);
3393         }
3394 }
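
/*
 * A decoding example with illustrative values: i_blocks_high = 1 and
 * i_blocks_lo = 0 combine to i_blocks = 1 << 32.  Without
 * EXT4_HUGE_FILE_FL that is 2^32 512-byte units (2TiB).  With the
 * flag set and 4KiB blocks, the same fields count filesystem blocks,
 * and the `<< (i_blkbits - 9)' above rescales them to 2^35 512-byte
 * units (16TiB), since inode->i_blocks is always kept in 512-byte
 * units.
 */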
3395
3396 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3397 {
3398         struct ext4_iloc iloc;
3399         struct ext4_inode *raw_inode;
3400         struct ext4_inode_info *ei;
3401         struct buffer_head *bh;
3402         struct inode *inode;
3403         long ret;
3404         int block;
3405
3406         inode = iget_locked(sb, ino);
3407         if (!inode)
3408                 return ERR_PTR(-ENOMEM);
3409         if (!(inode->i_state & I_NEW))
3410                 return inode;
3411
3412         ei = EXT4_I(inode);
3413 #ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
3414         ei->i_acl = EXT4_ACL_NOT_CACHED;
3415         ei->i_default_acl = EXT4_ACL_NOT_CACHED;
3416 #endif
3417         ei->i_block_alloc_info = NULL;
3418
3419         ret = __ext4_get_inode_loc(inode, &iloc, 0);
3420         if (ret < 0)
3421                 goto bad_inode;
3422         bh = iloc.bh;
3423         raw_inode = ext4_raw_inode(&iloc);
3424         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3425         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3426         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
3427         if(!(test_opt (inode->i_sb, NO_UID32))) {
3428                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3429                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3430         }
3431         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
3432
3433         ei->i_state = 0;
3434         ei->i_dir_start_lookup = 0;
3435         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
3436         /* We now have enough fields to check if the inode was active or not.
3437          * This is needed because nfsd might try to access dead inodes;
3438          * the test is the same one that e2fsck uses.
3439          * NeilBrown 1999oct15
3440          */
3441         if (inode->i_nlink == 0) {
3442                 if (inode->i_mode == 0 ||
3443                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
3444                         /* this inode is deleted */
3445                         brelse (bh);
3446                         ret = -ESTALE;
3447                         goto bad_inode;
3448                 }
3449                 /* The only unlinked inodes we let through here have
3450                  * valid i_mode and are being read by the orphan
3451                  * recovery code: that's fine, we're about to complete
3452                  * the process of deleting those. */
3453         }
3454         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
3455         inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
3456         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
3457         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
3458             cpu_to_le32(EXT4_OS_HURD)) {
3459                 ei->i_file_acl |=
3460                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
3461         }
3462         inode->i_size = ext4_isize(raw_inode);
3463         ei->i_disksize = inode->i_size;
3464         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
3465         ei->i_block_group = iloc.block_group;
3466         /*
3467          * NOTE! The in-memory inode i_data array is in little-endian order
3468          * even on big-endian machines: we do NOT byteswap the block numbers!
3469          */
3470         for (block = 0; block < EXT4_N_BLOCKS; block++)
3471                 ei->i_data[block] = raw_inode->i_block[block];
3472         INIT_LIST_HEAD(&ei->i_orphan);
3473
3474         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3475                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3476                 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3477                     EXT4_INODE_SIZE(inode->i_sb)) {
3478                         brelse (bh);
3479                         ret = -EIO;
3480                         goto bad_inode;
3481                 }
3482                 if (ei->i_extra_isize == 0) {
3483                         /* The extra space is currently unused. Use it. */
3484                         ei->i_extra_isize = sizeof(struct ext4_inode) -
3485                                             EXT4_GOOD_OLD_INODE_SIZE;
3486                 } else {
3487                         __le32 *magic = (void *)raw_inode +
3488                                         EXT4_GOOD_OLD_INODE_SIZE +
3489                                         ei->i_extra_isize;
3490                         if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
3491                                  ei->i_state |= EXT4_STATE_XATTR;
3492                 }
3493         } else
3494                 ei->i_extra_isize = 0;
3495
3496         EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
3497         EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
3498         EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
3499         EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
3500
3501         inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
3502         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3503                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
3504                         inode->i_version |=
3505                         (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
3506         }
3507
3508         if (S_ISREG(inode->i_mode)) {
3509                 inode->i_op = &ext4_file_inode_operations;
3510                 inode->i_fop = &ext4_file_operations;
3511                 ext4_set_aops(inode);
3512         } else if (S_ISDIR(inode->i_mode)) {
3513                 inode->i_op = &ext4_dir_inode_operations;
3514                 inode->i_fop = &ext4_dir_operations;
3515         } else if (S_ISLNK(inode->i_mode)) {
3516                 if (ext4_inode_is_fast_symlink(inode))
3517                         inode->i_op = &ext4_fast_symlink_inode_operations;
3518                 else {
3519                         inode->i_op = &ext4_symlink_inode_operations;
3520                         ext4_set_aops(inode);
3521                 }
3522         } else {
3523                 inode->i_op = &ext4_special_inode_operations;
3524                 if (raw_inode->i_block[0])
3525                         init_special_inode(inode, inode->i_mode,
3526                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
3527                 else
3528                         init_special_inode(inode, inode->i_mode,
3529                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
3530         }
3531         brelse (iloc.bh);
3532         ext4_set_inode_flags(inode);
3533         unlock_new_inode(inode);
3534         return inode;
3535
3536 bad_inode:
3537         iget_failed(inode);
3538         return ERR_PTR(ret);
3539 }
3540
3541 static int ext4_inode_blocks_set(handle_t *handle,
3542                                 struct ext4_inode *raw_inode,
3543                                 struct ext4_inode_info *ei)
3544 {
3545         struct inode *inode = &(ei->vfs_inode);
3546         u64 i_blocks = inode->i_blocks;
3547         struct super_block *sb = inode->i_sb;
3548         int err = 0;
3549
3550         if (i_blocks <= ~0U) {
3551                 /*
3552                  * i_blocks can be represented in a 32 bit variable
3553                  * as a multiple of 512 bytes
3554                  */
3555                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
3556                 raw_inode->i_blocks_high = 0;
3557                 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
3558         } else if (i_blocks <= 0xffffffffffffULL) {
3559                 /*
3560                  * i_blocks can be represented in a 48 bit variable
3561                  * as a multiple of 512 bytes
3562                  */
3563                 err = ext4_update_rocompat_feature(handle, sb,
3564                                             EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
3565                 if (err)
3566                         goto  err_out;
3567                 /* i_blocks is stored in the split 48 bit fields */
3568                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
3569                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
3570                 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
3571         } else {
3572                 /*
3573                  * i_blocks should be represented in a 48 bit variable
3574                  * as a multiple of the file system block size
3575                  */
3576                 err = ext4_update_rocompat_feature(handle, sb,
3577                                             EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
3578                 if (err)
3579                         goto  err_out;
3580                 ei->i_flags |= EXT4_HUGE_FILE_FL;
3581                 /* i_blocks is stored in units of the file system block size */
3582                 i_blocks = i_blocks >> (inode->i_blkbits - 9);
3583                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
3584                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
3585         }
3586 err_out:
3587         return err;
3588 }
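
/*
 * The thresholds above, for scale: 2^32 512-byte units is 2TiB, so
 * any larger file forces the 48-bit split (and the HUGE_FILE
 * ro-compat feature); past 2^48 units (128PiB) i_blocks switches to
 * filesystem-block units under EXT4_HUGE_FILE_FL.  This is the exact
 * inverse of the decoding done in ext4_inode_blocks() above.
 */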
3589
3590 /*
3591  * Post the struct inode info into an on-disk inode location in the
3592  * buffer-cache.  This gobbles the caller's reference to the
3593  * buffer_head in the inode location struct.
3594  *
3595  * The caller must have write access to iloc->bh.
3596  */
3597 static int ext4_do_update_inode(handle_t *handle,
3598                                 struct inode *inode,
3599                                 struct ext4_iloc *iloc)
3600 {
3601         struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
3602         struct ext4_inode_info *ei = EXT4_I(inode);
3603         struct buffer_head *bh = iloc->bh;
3604         int err = 0, rc, block;
3605
3606         /* For fields not tracked in the in-memory inode,
3607          * initialise them to zero for new inodes. */
3608         if (ei->i_state & EXT4_STATE_NEW)
3609                 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
3610
3611         ext4_get_inode_flags(ei);
3612         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
3613         if(!(test_opt(inode->i_sb, NO_UID32))) {
3614                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
3615                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
3616 /*
3617  * Fix up interoperability with old kernels. Otherwise, old inodes get
3618  * re-used with the upper 16 bits of the uid/gid intact
3619  */
3620                 if(!ei->i_dtime) {
3621                         raw_inode->i_uid_high =
3622                                 cpu_to_le16(high_16_bits(inode->i_uid));
3623                         raw_inode->i_gid_high =
3624                                 cpu_to_le16(high_16_bits(inode->i_gid));
3625                 } else {
3626                         raw_inode->i_uid_high = 0;
3627                         raw_inode->i_gid_high = 0;
3628                 }
3629         } else {
3630                 raw_inode->i_uid_low =
3631                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
3632                 raw_inode->i_gid_low =
3633                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
3634                 raw_inode->i_uid_high = 0;
3635                 raw_inode->i_gid_high = 0;
3636         }
3637         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
3638
3639         EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
3640         EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
3641         EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
3642         EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
3643
3644         if (ext4_inode_blocks_set(handle, raw_inode, ei))
3645                 goto out_brelse;
3646         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
3647         /* clear the migrate flag in the raw_inode */
3648         raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
3649         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
3650             cpu_to_le32(EXT4_OS_HURD))
3651                 raw_inode->i_file_acl_high =
3652                         cpu_to_le16(ei->i_file_acl >> 32);
3653         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
3654         ext4_isize_set(raw_inode, ei->i_disksize);
3655         if (ei->i_disksize > 0x7fffffffULL) {
3656                 struct super_block *sb = inode->i_sb;
3657                 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
3658                                 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
3659                                 EXT4_SB(sb)->s_es->s_rev_level ==
3660                                 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
3661                         /* If this is the first large file
3662                          * created, add a flag to the superblock.
3663                          */
3664                         err = ext4_journal_get_write_access(handle,
3665                                         EXT4_SB(sb)->s_sbh);
3666                         if (err)
3667                                 goto out_brelse;
3668                         ext4_update_dynamic_rev(sb);
3669                         EXT4_SET_RO_COMPAT_FEATURE(sb,
3670                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
3671                         sb->s_dirt = 1;
3672                         handle->h_sync = 1;
3673                         err = ext4_journal_dirty_metadata(handle,
3674                                         EXT4_SB(sb)->s_sbh);
3675                 }
3676         }
3677         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
3678         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
3679                 if (old_valid_dev(inode->i_rdev)) {
3680                         raw_inode->i_block[0] =
3681                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
3682                         raw_inode->i_block[1] = 0;
3683                 } else {
3684                         raw_inode->i_block[0] = 0;
3685                         raw_inode->i_block[1] =
3686                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
3687                         raw_inode->i_block[2] = 0;
3688                 }
3689         } else for (block = 0; block < EXT4_N_BLOCKS; block++)
3690                 raw_inode->i_block[block] = ei->i_data[block];
3691
3692         raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
3693         if (ei->i_extra_isize) {
3694                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
3695                         raw_inode->i_version_hi =
3696                         cpu_to_le32(inode->i_version >> 32);
3697                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
3698         }
3699
3700
3701         BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
3702         rc = ext4_journal_dirty_metadata(handle, bh);
3703         if (!err)
3704                 err = rc;
3705         ei->i_state &= ~EXT4_STATE_NEW;
3706
3707 out_brelse:
3708         brelse (bh);
3709         ext4_std_error(inode->i_sb, err);
3710         return err;
3711 }
3712
3713 /*
3714  * ext4_write_inode()
3715  *
3716  * We are called from a few places:
3717  *
3718  * - Within generic_file_write() for O_SYNC files.
3719  *   Here, there will be no transaction running. We wait for any running
3720  *   transaction to commit.
3721  *
3722  * - Within sys_sync(), kupdate and such.
3723  *   We wait on commit, if told to.
3724  *
3725  * - Within prune_icache() (PF_MEMALLOC == true)
3726  *   Here we simply return.  We can't afford to block kswapd on the
3727  *   journal commit.
3728  *
3729  * In all cases it is actually safe for us to return without doing anything,
3730  * because the inode has been copied into a raw inode buffer in
3731  * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
3732  * knfsd.
3733  *
3734  * Note that we are absolutely dependent upon all inode dirtiers doing the
3735  * right thing: they *must* call mark_inode_dirty() after dirtying info in
3736  * which we are interested.
3737  *
3738  * It would be a bug for them to not do this.  The code:
3739  *
3740  *      mark_inode_dirty(inode)
3741  *      stuff();
3742  *      inode->i_size = expr;
3743  *
3744  * is in error because a kswapd-driven write_inode() could occur while
3745  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
3746  * will no longer be on the superblock's dirty inode list.
3747  */
3748 int ext4_write_inode(struct inode *inode, int wait)
3749 {
3750         if (current->flags & PF_MEMALLOC)
3751                 return 0;
3752
3753         if (ext4_journal_current_handle()) {
3754                 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
3755                 dump_stack();
3756                 return -EIO;
3757         }
3758
3759         if (!wait)
3760                 return 0;
3761
3762         return ext4_force_commit(inode->i_sb);
3763 }
3764
3765 /*
3766  * ext4_setattr()
3767  *
3768  * Called from notify_change.
3769  *
3770  * We want to trap VFS attempts to truncate the file as soon as
3771  * possible.  In particular, we want to make sure that when the VFS
3772  * shrinks i_size, we put the inode on the orphan list and modify
3773  * i_disksize immediately, so that during the subsequent flushing of
3774  * dirty pages and freeing of disk blocks, we can guarantee that any
3775  * commit will leave the blocks being flushed in an unused state on
3776  * disk.  (On recovery, the inode will get truncated and the blocks will
3777  * be freed, so we have a strong guarantee that no future commit will
3778  * leave these blocks visible to the user.)
3779  *
3780  * Another thing we have to assure is that if we are in ordered mode
3781  * and the inode is still attached to the committing transaction, we must
3782  * start writeout of all the dirty pages which are being truncated.
3783  * This way we are sure that all the data written in the previous
3784  * transaction are already on disk (truncate waits for pages under
3785  * writeback).
3786  *
3787  * Called with inode->i_mutex down.
3788  */
3789 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3790 {
3791         struct inode *inode = dentry->d_inode;
3792         int error, rc = 0;
3793         const unsigned int ia_valid = attr->ia_valid;
3794
3795         error = inode_change_ok(inode, attr);
3796         if (error)
3797                 return error;
3798
3799         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3800                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3801                 handle_t *handle;
3802
3803                 /* (user+group)*(old+new) structure, inode write (sb,
3804                  * inode block, ? - but truncate inode update has it) */
3805                 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
3806                                         EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
3807                 if (IS_ERR(handle)) {
3808                         error = PTR_ERR(handle);
3809                         goto err_out;
3810                 }
3811                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
3812                 if (error) {
3813                         ext4_journal_stop(handle);
3814                         return error;
3815                 }
3816                 /* Update corresponding info in inode so that everything is in
3817                  * one transaction */
3818                 if (attr->ia_valid & ATTR_UID)
3819                         inode->i_uid = attr->ia_uid;
3820                 if (attr->ia_valid & ATTR_GID)
3821                         inode->i_gid = attr->ia_gid;
3822                 error = ext4_mark_inode_dirty(handle, inode);
3823                 ext4_journal_stop(handle);
3824         }
3825
3826         if (attr->ia_valid & ATTR_SIZE) {
3827                 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
3828                         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3829
3830                         if (attr->ia_size > sbi->s_bitmap_maxbytes) {
3831                                 error = -EFBIG;
3832                                 goto err_out;
3833                         }
3834                 }
3835         }
3836
3837         if (S_ISREG(inode->i_mode) &&
3838             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
3839                 handle_t *handle;
3840
3841                 handle = ext4_journal_start(inode, 3);
3842                 if (IS_ERR(handle)) {
3843                         error = PTR_ERR(handle);
3844                         goto err_out;
3845                 }
3846
3847                 error = ext4_orphan_add(handle, inode);
3848                 EXT4_I(inode)->i_disksize = attr->ia_size;
3849                 rc = ext4_mark_inode_dirty(handle, inode);
3850                 if (!error)
3851                         error = rc;
3852                 ext4_journal_stop(handle);
3853
3854                 if (ext4_should_order_data(inode)) {
3855                         error = ext4_begin_ordered_truncate(inode,
3856                                                             attr->ia_size);
3857                         if (error) {
3858                                 /* Do as much error cleanup as possible */
3859                                 handle = ext4_journal_start(inode, 3);
3860                                 if (IS_ERR(handle)) {
3861                                         ext4_orphan_del(NULL, inode);
3862                                         goto err_out;
3863                                 }
3864                                 ext4_orphan_del(handle, inode);
3865                                 ext4_journal_stop(handle);
3866                                 goto err_out;
3867                         }
3868                 }
3869         }
3870
3871         rc = inode_setattr(inode, attr);
3872
3873         /* If inode_setattr's call to ext4_truncate failed to get a
3874          * transaction handle at all, we need to clean up the in-core
3875          * orphan list manually. */
3876         if (inode->i_nlink)
3877                 ext4_orphan_del(NULL, inode);
3878
3879         if (!rc && (ia_valid & ATTR_MODE))
3880                 rc = ext4_acl_chmod(inode);
3881
3882 err_out:
3883         ext4_std_error(inode->i_sb, error);
3884         if (!error)
3885                 error = rc;
3886         return error;
3887 }
3888
3889
3890 /*
3891  * How many blocks doth make a writepage()?
3892  *
3893  * With N blocks per page, it may be:
3894  * N data blocks
3895  * 2 indirect blocks
3896  * 2 dindirect blocks
3897  * 1 tindirect block
3898  * N+5 bitmap blocks (from the above)
3899  * N+5 group descriptor summary blocks
3900  * 1 inode block
3901  * 1 superblock.
3902  * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quota files
3903  *
3904  * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
3905  *
3906  * With ordered or writeback data it's the same, less the N data blocks.
3907  *
3908  * If the inode's direct blocks can hold an integral number of pages then a
3909  * page cannot straddle two indirect blocks, and we can only touch one indirect
3910  * and dindirect block, and the "5" above becomes "3".
3911  *
3912  * This still overestimates under most circumstances.  If we were to pass the
3913  * start and end offsets in here as well we could do block_to_path() on each
3914  * block and work out the exact number of indirects which are touched.  Pah.
3915  */
3916
3917 int ext4_writepage_trans_blocks(struct inode *inode)
3918 {
3919         int bpp = ext4_journal_blocks_per_page(inode);
3920         int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
3921         int ret;
3922
3923         if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3924                 return ext4_ext_writepage_trans_blocks(inode, bpp);
3925
3926         if (ext4_should_journal_data(inode))
3927                 ret = 3 * (bpp + indirects) + 2;
3928         else
3929                 ret = 2 * (bpp + indirects) + 2;
3930
3931 #ifdef CONFIG_QUOTA
3932         /* We know that the structure was already allocated during DQUOT_INIT
3933          * so we will be updating only the data blocks + inodes */
3934         ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
3935 #endif
3936
3937         return ret;
3938 }
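
/*
 * Plugging in illustrative numbers: with 1KiB blocks and 4KiB pages,
 * bpp = 4, and since EXT4_NDIR_BLOCKS (12) is a multiple of 4 a page
 * cannot straddle two indirect blocks, so indirects = 3.  That gives
 * 3 * (4 + 3) + 2 = 23 credits per page in data=journal mode and
 * 2 * (4 + 3) + 2 = 16 in ordered/writeback mode, plus the quota
 * blocks when CONFIG_QUOTA is set.
 */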
3939
3940 /*
3941  * The caller must have previously called ext4_reserve_inode_write().
3942  * Given this, we know that the caller already has write access to iloc->bh.
3943  */
3944 int ext4_mark_iloc_dirty(handle_t *handle,
3945                 struct inode *inode, struct ext4_iloc *iloc)
3946 {
3947         int err = 0;
3948
3949         if (test_opt(inode->i_sb, I_VERSION))
3950                 inode_inc_iversion(inode);
3951
3952         /* the do_update_inode consumes one bh->b_count */
3953         get_bh(iloc->bh);
3954
3955         /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
3956         err = ext4_do_update_inode(handle, inode, iloc);
3957         put_bh(iloc->bh);
3958         return err;
3959 }
3960
3961 /*
3962  * On success, we end up with an outstanding reference count against
3963  * iloc->bh.  This _must_ be cleaned up later.
3964  */
3965
3966 int
3967 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
3968                          struct ext4_iloc *iloc)
3969 {
3970         int err = 0;
3971         if (handle) {
3972                 err = ext4_get_inode_loc(inode, iloc);
3973                 if (!err) {
3974                         BUFFER_TRACE(iloc->bh, "get_write_access");
3975                         err = ext4_journal_get_write_access(handle, iloc->bh);
3976                         if (err) {
3977                                 brelse(iloc->bh);
3978                                 iloc->bh = NULL;
3979                         }
3980                 }
3981         }
3982         ext4_std_error(inode->i_sb, err);
3983         return err;
3984 }
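
/*
 * The usual pairing of the two helpers above, as a sketch (error
 * handling trimmed):
 *
 *      struct ext4_iloc iloc;
 *
 *      err = ext4_reserve_inode_write(handle, inode, &iloc);
 *      if (!err) {
 *              // modify the in-core inode fields here
 *              err = ext4_mark_iloc_dirty(handle, inode, &iloc);
 *      }
 *
 * ext4_mark_inode_dirty() below is essentially this sequence, with
 * the extra-isize expansion wedged in between.
 */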
3985
3986 /*
3987  * Expand an inode by new_extra_isize bytes.
3988  * Returns 0 on success or negative error number on failure.
3989  */
3990 static int ext4_expand_extra_isize(struct inode *inode,
3991                                    unsigned int new_extra_isize,
3992                                    struct ext4_iloc iloc,
3993                                    handle_t *handle)
3994 {
3995         struct ext4_inode *raw_inode;
3996         struct ext4_xattr_ibody_header *header;
3997         struct ext4_xattr_entry *entry;
3998
3999         if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
4000                 return 0;
4001
4002         raw_inode = ext4_raw_inode(&iloc);
4003
4004         header = IHDR(inode, raw_inode);
4005         entry = IFIRST(header);
4006
4007         /* No extended attributes present */
4008         if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
4009                 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
4010                 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
4011                         new_extra_isize);
4012                 EXT4_I(inode)->i_extra_isize = new_extra_isize;
4013                 return 0;
4014         }
4015
4016         /* try to expand with EAs present */
4017         return ext4_expand_extra_isize_ea(inode, new_extra_isize,
4018                                           raw_inode, handle);
4019 }
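
/*
 * The layout being manipulated here, for reference (256-byte inode,
 * illustrative): bytes 0-127 are the good old inode; the next
 * i_extra_isize bytes hold the fixed extra fields (i_extra_isize
 * itself, the extended timestamps, i_version_hi); in-inode xattrs
 * start right after, announced by EXT4_XATTR_MAGIC.  Growing
 * i_extra_isize is therefore trivial when no xattrs are present (zero
 * the gap, as above) and otherwise requires shifting the xattr
 * entries, which is what ext4_expand_extra_isize_ea() does.
 */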
4020
4021 /*
4022  * What we do here is to mark the in-core inode as clean with respect to inode
4023  * dirtiness (it may still be data-dirty).
4024  * This means that the in-core inode may be reaped by prune_icache
4025  * without having to perform any I/O.  This is a very good thing,
4026  * because *any* task may call prune_icache - even ones which
4027  * have a transaction open against a different journal.
4028  *
4029  * Is this cheating?  Not really.  Sure, we haven't written the
4030  * inode out, but prune_icache isn't a user-visible syncing function.
4031  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
4032  * we start and wait on commits.
4033  *
4034  * Is this efficient/effective?  Well, we're being nice to the system
4035  * by cleaning up our inodes proactively so they can be reaped
4036  * without I/O.  But we are potentially leaving up to five seconds'
4037  * worth of inodes floating about which prune_icache wants us to
4038  * write out.  One way to fix that would be to get prune_icache()
4039  * to do a write_super() to free up some memory, which would have
4040  * the desired effect.
4041  */
4042 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4043 {
4044         struct ext4_iloc iloc;
4045         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4046         static unsigned int mnt_count;
4047         int err, ret;
4048
4049         might_sleep();
4050         err = ext4_reserve_inode_write(handle, inode, &iloc);
4051         if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4052             !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
4053                 /*
4054                  * We need extra buffer credits since we may write into the EA block
4055                  * with this same handle. If journal_extend fails, then it will
4056                  * only result in a minor loss of functionality for that inode.
4057                  * If this is felt to be critical, then e2fsck should be run to
4058                  * force a large enough s_min_extra_isize.
4059                  */
4060                 if ((jbd2_journal_extend(handle,
4061                              EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
4062                         ret = ext4_expand_extra_isize(inode,
4063                                                       sbi->s_want_extra_isize,
4064                                                       iloc, handle);
4065                         if (ret) {
4066                                 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
4067                                 if (mnt_count !=
4068                                         le16_to_cpu(sbi->s_es->s_mnt_count)) {
4069                                         ext4_warning(inode->i_sb, __func__,
4070                                         "Unable to expand inode %lu. Delete"
4071                                         " some EAs or run e2fsck.",
4072                                         inode->i_ino);
4073                                         mnt_count =
4074                                           le16_to_cpu(sbi->s_es->s_mnt_count);
4075                                 }
4076                         }
4077                 }
4078         }
4079         if (!err)
4080                 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
4081         return err;
4082 }
4083
4084 /*
4085  * ext4_dirty_inode() is called from __mark_inode_dirty()
4086  *
4087  * We're really interested in the case where a file is being extended.
4088  * i_size has been changed by generic_commit_write() and we thus need
4089  * to include the updated inode in the current transaction.
4090  *
4091  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
4092  * are allocated to the file.
4093  *
4094  * If the inode is marked synchronous, we don't honour that here - doing
4095  * so would cause a commit on atime updates, which we don't bother doing.
4096  * We handle synchronous inodes at the highest possible level.
4097  */
4098 void ext4_dirty_inode(struct inode *inode)
4099 {
4100         handle_t *current_handle = ext4_journal_current_handle();
4101         handle_t *handle;
4102
4103         handle = ext4_journal_start(inode, 2);
4104         if (IS_ERR(handle))
4105                 goto out;
4106         if (current_handle &&
4107                 current_handle->h_transaction != handle->h_transaction) {
4108                 /* This task has a transaction open against a different fs */
4109                 printk(KERN_EMERG "%s: transactions do not match!\n",
4110                        __func__);
4111         } else {
4112                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
4113                                 current_handle);
4114                 ext4_mark_inode_dirty(handle, inode);
4115         }
4116         ext4_journal_stop(handle);
4117 out:
4118         return;
4119 }
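
/*
 * Illustrative only: ext4_dirty_inode() is not called directly; the VFS
 * reaches it through the super_operations table, roughly as sketched
 * below (simplified from fs/ext4/super.c, not built here).
 */
#if 0
static const struct super_operations ext4_sops = {
        .dirty_inode    = ext4_dirty_inode,
        /* ... the remaining operations are omitted ... */
};
#endif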
4120
4121 #if 0
4122 /*
4123  * Bind an inode's backing buffer_head into this transaction, to prevent
4124  * it from being flushed to disk early.  Unlike
4125  * ext4_reserve_inode_write, this leaves behind no bh reference and
4126  * returns no iloc structure, so the caller needs to repeat the iloc
4127  * lookup to mark the inode dirty later.
4128  */
4129 static int ext4_pin_inode(handle_t *handle, struct inode *inode)
4130 {
4131         struct ext4_iloc iloc;
4132
4133         int err = 0;
4134         if (handle) {
4135                 err = ext4_get_inode_loc(inode, &iloc);
4136                 if (!err) {
4137                         BUFFER_TRACE(iloc.bh, "get_write_access");
4138                         err = jbd2_journal_get_write_access(handle, iloc.bh);
4139                         if (!err)
4140                                 err = ext4_journal_dirty_metadata(handle,
4141                                                                   iloc.bh);
4142                         brelse(iloc.bh);
4143                 }
4144         }
4145         ext4_std_error(inode->i_sb, err);
4146         return err;
4147 }
4148 #endif
4149
4150 int ext4_change_inode_journal_flag(struct inode *inode, int val)
4151 {
4152         journal_t *journal;
4153         handle_t *handle;
4154         int err;
4155
4156         /*
4157          * We have to be very careful here: changing a data block's
4158          * journaling status dynamically is dangerous.  If we write a
4159          * data block to the journal, change the status and then delete
4160          * that block, we risk forgetting to revoke the old log record
4161          * from the journal and so a subsequent replay can corrupt data.
4162          * So, first we make sure that the journal is empty and that
4163          * nobody is changing anything.
4164          */
4165
4166         journal = EXT4_JOURNAL(inode);
4167         if (is_journal_aborted(journal))
4168                 return -EROFS;
4169
4170         jbd2_journal_lock_updates(journal);
4171         jbd2_journal_flush(journal);
4172
4173         /*
4174          * OK, there are no updates running now, and all cached data is
4175          * synced to disk.  We are now in a completely consistent state
4176          * which doesn't have anything in the journal, and we know that
4177          * no filesystem updates are running, so it is safe to modify
4178          * the inode's in-core data-journaling state flag now.
4179          */
4180
4181         if (val)
4182                 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
4183         else
4184                 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
4185         ext4_set_aops(inode);
4186
4187         jbd2_journal_unlock_updates(journal);
4188
4189         /* Finally we can mark the inode as dirty. */
4190
4191         handle = ext4_journal_start(inode, 1);
4192         if (IS_ERR(handle))
4193                 return PTR_ERR(handle);
4194
4195         err = ext4_mark_inode_dirty(handle, inode);
4196         handle->h_sync = 1;
4197         ext4_journal_stop(handle);
4198         ext4_std_error(inode->i_sb, err);
4199
4200         return err;
4201 }
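
/*
 * Illustrative only: the usual caller is the EXT4_IOC_SETFLAGS ioctl
 * path, which switches one inode's data-journaling mode roughly like the
 * fragment below (simplified from fs/ext4/ioctl.c; locking and error
 * handling omitted, not built here).
 */
#if 0
        if ((flags ^ oldflags) & EXT4_JOURNAL_DATA_FL)
                err = ext4_change_inode_journal_flag(inode,
                                flags & EXT4_JOURNAL_DATA_FL);
#endif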
4202
4203 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4204 {
4205         return !buffer_mapped(bh);
4206 }
4207
4208 int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4209 {
4210         loff_t size;
4211         unsigned long len;
4212         int ret = -EINVAL;
4213         struct file *file = vma->vm_file;
4214         struct inode *inode = file->f_path.dentry->d_inode;
4215         struct address_space *mapping = inode->i_mapping;
4216
4217         /*
4218          * Get i_alloc_sem to stop truncates messing with the inode. We cannot
4219          * get i_mutex because we are already holding mmap_sem.
4220          */
4221         down_read(&inode->i_alloc_sem);
4222         size = i_size_read(inode);
4223         if (page->mapping != mapping || size <= page_offset(page)
4224             || !PageUptodate(page)) {
4225                 /* page got truncated from under us? */
4226                 goto out_unlock;
4227         }
4228         ret = 0;
4229         if (PageMappedToDisk(page))
4230                 goto out_unlock;
4231
4232         if (page->index == size >> PAGE_CACHE_SHIFT)
4233                 len = size & ~PAGE_CACHE_MASK;
4234         else
4235                 len = PAGE_CACHE_SIZE;
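        /*
         * Worked example (4 KiB pages): with i_size == 10000 the last
         * page has index 2 (10000 >> 12) and only len == 1808 bytes
         * (10000 & 4095) of it lie inside the file; every earlier page
         * uses the full PAGE_CACHE_SIZE.
         */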
4236
4237         if (page_has_buffers(page)) {
4238                 /* return if all the buffers are already mapped */
4239                 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4240                                        ext4_bh_unmapped))
4241                         goto out_unlock;
4242         }
4243         /*
4244          * OK, we need to fill the hole... Use write_begin/write_end to
4245          * do the block allocation/reservation.  We are not holding
4246          * inode->i_mutex here, which allows parallel write_begin and
4247          * write_end calls; lock_page prevents them from racing on the
4248          * same page, though.
4249          */
4250         ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4251                         len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
4252         if (ret < 0)
4253                 goto out_unlock;
4254         ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4255                         len, len, page, NULL);
4256         if (ret < 0)
4257                 goto out_unlock;
4258         ret = 0;
4259 out_unlock:
4260         up_read(&inode->i_alloc_sem);
4261         return ret;
4262 }
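
/*
 * Illustrative only: ext4_page_mkwrite() is wired into mmap write faults
 * through the file's vm_operations, roughly as in fs/ext4/file.c (sketch,
 * not built here):
 */
#if 0
static struct vm_operations_struct ext4_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = ext4_page_mkwrite,
};
#endif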