fs/xfs/xfs_inode.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include <linux/log2.h>
  19
  20 #include "xfs.h"
  21 #include "xfs_fs.h"
  22 #include "xfs_format.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_trans_priv.h"
  27 #include "xfs_sb.h"
  28 #include "xfs_ag.h"
  29 #include "xfs_mount.h"
  30 #include "xfs_bmap_btree.h"
  31 #include "xfs_alloc_btree.h"
  32 #include "xfs_ialloc_btree.h"
  33 #include "xfs_attr_sf.h"
  34 #include "xfs_dinode.h"
  35 #include "xfs_inode.h"
  36 #include "xfs_buf_item.h"
  37 #include "xfs_inode_item.h"
  38 #include "xfs_btree.h"
  39 #include "xfs_alloc.h"
  40 #include "xfs_ialloc.h"
  41 #include "xfs_bmap.h"
  42 #include "xfs_error.h"
  43 #include "xfs_utils.h"
  44 #include "xfs_quota.h"
  45 #include "xfs_filestream.h"
  46 #include "xfs_vnodeops.h"
  47 #include "xfs_cksum.h"
  48 #include "xfs_trace.h"
  49 #include "xfs_icache.h"
  50
  51 kmem_zone_t *xfs_inode_zone;
  52
  53 /*
  54  * Used in xfs_itruncate_extents().  This is the maximum number of extents
  55  * freed from a file in a single transaction.
  56  */
  57 #define XFS_ITRUNC_MAX_EXTENTS  2
  58
  59 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
  60
  61 /*
  62  * helper function to extract extent size hint from inode
  63  */
  64 xfs_extlen_t
  65 xfs_get_extsz_hint(
  66         struct xfs_inode        *ip)
  67 {
  68         if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
  69                 return ip->i_d.di_extsize;
  70         if (XFS_IS_REALTIME_INODE(ip))
  71                 return ip->i_mount->m_sb.sb_rextsize;
  72         return 0;
  73 }
  74
  75 /*
  76  * This is a wrapper routine around the xfs_ilock() routine used to centralize
  77  * some grungy code.  It is used in places that wish to lock the inode solely
  78  * for reading the extents.  The reason these places can't just call
  79  * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the
  80  * extents from disk for a file in b-tree format.  If the inode is in b-tree
  81  * format, then we need to lock the inode exclusively until the extents are read
  82  * in.  Locking it exclusively all the time would limit our parallelism
  83  * unnecessarily, though.  What we do instead is check to see if the extents
  84  * have been read in yet, and only lock the inode exclusively if they have not.
  85  *
  86  * The function returns a value which should be given to the corresponding
  87  * xfs_iunlock_map_shared().  This value is the mode in which the lock was
  88  * actually taken.
  89  */
  90 uint
  91 xfs_ilock_map_shared(
  92         xfs_inode_t     *ip)
  93 {
  94         uint    lock_mode;
  95
  96         if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
  97             ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
  98                 lock_mode = XFS_ILOCK_EXCL;
  99         } else {
 100                 lock_mode = XFS_ILOCK_SHARED;
 101         }
 102
 103         xfs_ilock(ip, lock_mode);
 104
 105         return lock_mode;
 106 }
 107
 108 /*
 109  * This is simply the unlock routine to go with xfs_ilock_map_shared().
 110  * All it does is call xfs_iunlock() with the given lock_mode.
 111  */
 112 void
 113 xfs_iunlock_map_shared(
 114         xfs_inode_t     *ip,
 115         unsigned int    lock_mode)
 116 {
 117         xfs_iunlock(ip, lock_mode);
 118 }
 119
 120 /*
 121  * The xfs inode contains 2 locks: a multi-reader lock called the
 122  * i_iolock and a multi-reader lock called the i_lock.  This routine
 123  * allows either or both of the locks to be obtained.
 124  *
 125  * The 2 locks should always be ordered so that the IO lock is
 126  * obtained first in order to prevent deadlock.
 127  *
 128  * ip -- the inode being locked
 129  * lock_flags -- this parameter indicates the inode's locks
 130  *       to be locked.  It can be:
 131  *              XFS_IOLOCK_SHARED,
 132  *              XFS_IOLOCK_EXCL,
 133  *              XFS_ILOCK_SHARED,
 134  *              XFS_ILOCK_EXCL,
 135  *              XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
 136  *              XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
 137  *              XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
 138  *              XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
 139  */
 140 void
 141 xfs_ilock(
 142         xfs_inode_t             *ip,
 143         uint                    lock_flags)
 144 {
 145         trace_xfs_ilock(ip, lock_flags, _RET_IP_);
 146
 147         /*
 148          * You can't set both SHARED and EXCL for the same lock,
 149          * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 150          * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 151          */
 152         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 153                (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 154         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 155                (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 156         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 157
 158         if (lock_flags & XFS_IOLOCK_EXCL)
 159                 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 160         else if (lock_flags & XFS_IOLOCK_SHARED)
 161                 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 162
 163         if (lock_flags & XFS_ILOCK_EXCL)
 164                 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 165         else if (lock_flags & XFS_ILOCK_SHARED)
 166                 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 167 }
 168
 169 /*
 170  * This is just like xfs_ilock(), except that the caller
 171  * is guaranteed not to sleep.  It returns 1 if it gets
 172  * the requested locks and 0 otherwise.  If the IO lock is
 173  * obtained but the inode lock cannot be, then the IO lock
 174  * is dropped before returning.
 175  *
 176  * ip -- the inode being locked
 177  * lock_flags -- this parameter indicates the inode's locks to be
 178  *       to be locked.  See the comment for xfs_ilock() for a list
 179  *       of valid values.
 180  */
 181 int
 182 xfs_ilock_nowait(
 183         xfs_inode_t             *ip,
 184         uint                    lock_flags)
 185 {
 186         trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
 187
 188         /*
 189          * You can't set both SHARED and EXCL for the same lock,
 190          * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 191          * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 192          */
 193         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 194                (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 195         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 196                (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 197         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 198
 199         if (lock_flags & XFS_IOLOCK_EXCL) {
 200                 if (!mrtryupdate(&ip->i_iolock))
 201                         goto out;
 202         } else if (lock_flags & XFS_IOLOCK_SHARED) {
 203                 if (!mrtryaccess(&ip->i_iolock))
 204                         goto out;
 205         }
 206         if (lock_flags & XFS_ILOCK_EXCL) {
 207                 if (!mrtryupdate(&ip->i_lock))
 208                         goto out_undo_iolock;
 209         } else if (lock_flags & XFS_ILOCK_SHARED) {
 210                 if (!mrtryaccess(&ip->i_lock))
 211                         goto out_undo_iolock;
 212         }
 213         return 1;
 214
 215  out_undo_iolock:
 216         if (lock_flags & XFS_IOLOCK_EXCL)
 217                 mrunlock_excl(&ip->i_iolock);
 218         else if (lock_flags & XFS_IOLOCK_SHARED)
 219                 mrunlock_shared(&ip->i_iolock);
 220  out:
 221         return 0;
 222 }
 223
 224 /*
 225  * xfs_iunlock() is used to drop the inode locks acquired with
 226  * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 227  * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 228  * that we know which locks to drop.
 229  *
 230  * ip -- the inode being unlocked
 231  * lock_flags -- this parameter indicates the inode's locks to be
 232  *       to be unlocked.  See the comment for xfs_ilock() for a list
 233  *       of valid values for this parameter.
 234  *
 235  */
 236 void
 237 xfs_iunlock(
 238         xfs_inode_t             *ip,
 239         uint                    lock_flags)
 240 {
 241         /*
 242          * You can't set both SHARED and EXCL for the same lock,
 243          * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 244          * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 245          */
 246         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 247                (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 248         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 249                (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 250         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 251         ASSERT(lock_flags != 0);
 252
 253         if (lock_flags & XFS_IOLOCK_EXCL)
 254                 mrunlock_excl(&ip->i_iolock);
 255         else if (lock_flags & XFS_IOLOCK_SHARED)
 256                 mrunlock_shared(&ip->i_iolock);
 257
 258         if (lock_flags & XFS_ILOCK_EXCL)
 259                 mrunlock_excl(&ip->i_lock);
 260         else if (lock_flags & XFS_ILOCK_SHARED)
 261                 mrunlock_shared(&ip->i_lock);
 262
 263         trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 264 }
 265
 266 /*
 267  * give up write locks.  the i/o lock cannot be held nested
 268  * if it is being demoted.
 269  */
 270 void
 271 xfs_ilock_demote(
 272         xfs_inode_t             *ip,
 273         uint                    lock_flags)
 274 {
 275         ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
 276         ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 277
 278         if (lock_flags & XFS_ILOCK_EXCL)
 279                 mrdemote(&ip->i_lock);
 280         if (lock_flags & XFS_IOLOCK_EXCL)
 281                 mrdemote(&ip->i_iolock);
 282
 283         trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 284 }
 285
 286 #if defined(DEBUG) || defined(XFS_WARN)
 287 int
 288 xfs_isilocked(
 289         xfs_inode_t             *ip,
 290         uint                    lock_flags)
 291 {
 292         if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
 293                 if (!(lock_flags & XFS_ILOCK_SHARED))
 294                         return !!ip->i_lock.mr_writer;
 295                 return rwsem_is_locked(&ip->i_lock.mr_lock);
 296         }
 297
 298         if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
 299                 if (!(lock_flags & XFS_IOLOCK_SHARED))
 300                         return !!ip->i_iolock.mr_writer;
 301                 return rwsem_is_locked(&ip->i_iolock.mr_lock);
 302         }
 303
 304         ASSERT(0);
 305         return 0;
 306 }
 307 #endif
 308
 309 void
 310 __xfs_iflock(
 311         struct xfs_inode        *ip)
 312 {
 313         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
 314         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
 315
 316         do {
 317                 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 318                 if (xfs_isiflocked(ip))
 319                         io_schedule();
 320         } while (!xfs_iflock_nowait(ip));
 321
 322         finish_wait(wq, &wait.wait);
 323 }
 324
 325 STATIC uint
 326 _xfs_dic2xflags(
 327         __uint16_t              di_flags)
 328 {
 329         uint                    flags = 0;
 330
 331         if (di_flags & XFS_DIFLAG_ANY) {
 332                 if (di_flags & XFS_DIFLAG_REALTIME)
 333                         flags |= XFS_XFLAG_REALTIME;
 334                 if (di_flags & XFS_DIFLAG_PREALLOC)
 335                         flags |= XFS_XFLAG_PREALLOC;
 336                 if (di_flags & XFS_DIFLAG_IMMUTABLE)
 337                         flags |= XFS_XFLAG_IMMUTABLE;
 338                 if (di_flags & XFS_DIFLAG_APPEND)
 339                         flags |= XFS_XFLAG_APPEND;
 340                 if (di_flags & XFS_DIFLAG_SYNC)
 341                         flags |= XFS_XFLAG_SYNC;
 342                 if (di_flags & XFS_DIFLAG_NOATIME)
 343                         flags |= XFS_XFLAG_NOATIME;
 344                 if (di_flags & XFS_DIFLAG_NODUMP)
 345                         flags |= XFS_XFLAG_NODUMP;
 346                 if (di_flags & XFS_DIFLAG_RTINHERIT)
 347                         flags |= XFS_XFLAG_RTINHERIT;
 348                 if (di_flags & XFS_DIFLAG_PROJINHERIT)
 349                         flags |= XFS_XFLAG_PROJINHERIT;
 350                 if (di_flags & XFS_DIFLAG_NOSYMLINKS)
 351                         flags |= XFS_XFLAG_NOSYMLINKS;
 352                 if (di_flags & XFS_DIFLAG_EXTSIZE)
 353                         flags |= XFS_XFLAG_EXTSIZE;
 354                 if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
 355                         flags |= XFS_XFLAG_EXTSZINHERIT;
 356                 if (di_flags & XFS_DIFLAG_NODEFRAG)
 357                         flags |= XFS_XFLAG_NODEFRAG;
 358                 if (di_flags & XFS_DIFLAG_FILESTREAM)
 359                         flags |= XFS_XFLAG_FILESTREAM;
 360         }
 361
 362         return flags;
 363 }
 364
 365 uint
 366 xfs_ip2xflags(
 367         xfs_inode_t             *ip)
 368 {
 369         xfs_icdinode_t          *dic = &ip->i_d;
 370
 371         return _xfs_dic2xflags(dic->di_flags) |
 372                                 (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
 373 }
 374
 375 uint
 376 xfs_dic2xflags(
 377         xfs_dinode_t            *dip)
 378 {
 379         return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
 380                                 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 381 }
 382
 383 /*
 384  * Allocate an inode on disk and return a copy of its in-core version.
 385  * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 386  * appropriately within the inode.  The uid and gid for the inode are
 387  * set according to the contents of the given cred structure.
 388  *
 389  * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 390  * has a free inode available, call xfs_iget() to obtain the in-core
 391  * version of the allocated inode.  Finally, fill in the inode and
 392  * log its initial contents.  In this case, ialloc_context would be
 393  * set to NULL.
 394  *
 395  * If xfs_dialloc() does not have an available inode, it will replenish
 396  * its supply by doing an allocation. Since we can only do one
 397  * allocation within a transaction without deadlocks, we must commit
 398  * the current transaction before returning the inode itself.
 399  * In this case, therefore, we will set ialloc_context and return.
 400  * The caller should then commit the current transaction, start a new
 401  * transaction, and call xfs_ialloc() again to actually get the inode.
 402  *
 403  * To ensure that some other process does not grab the inode that
 404  * was allocated during the first call to xfs_ialloc(), this routine
 405  * also returns the [locked] bp pointing to the head of the freelist
 406  * as ialloc_context.  The caller should hold this buffer across
 407  * the commit and pass it back into this routine on the second call.
 408  *
 409  * If we are allocating quota inodes, we do not have a parent inode
 410  * to attach to or associate with (i.e. pip == NULL) because they
 411  * are not linked into the directory structure - they are attached
 412  * directly to the superblock - and so have no parent.
 413  */
 414 int
 415 xfs_ialloc(
 416         xfs_trans_t     *tp,
 417         xfs_inode_t     *pip,
 418         umode_t         mode,
 419         xfs_nlink_t     nlink,
 420         xfs_dev_t       rdev,
 421         prid_t          prid,
 422         int             okalloc,
 423         xfs_buf_t       **ialloc_context,
 424         xfs_inode_t     **ipp)
 425 {
 426         struct xfs_mount *mp = tp->t_mountp;
 427         xfs_ino_t       ino;
 428         xfs_inode_t     *ip;
 429         uint            flags;
 430         int             error;
 431         timespec_t      tv;
 432         int             filestreams = 0;
 433
 434         /*
 435          * Call the space management code to pick
 436          * the on-disk inode to be allocated.
 437          */
 438         error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
 439                             ialloc_context, &ino);
 440         if (error)
 441                 return error;
 442         if (*ialloc_context || ino == NULLFSINO) {
 443                 *ipp = NULL;
 444                 return 0;
 445         }
 446         ASSERT(*ialloc_context == NULL);
 447
 448         /*
 449          * Get the in-core inode with the lock held exclusively.
 450          * This is because we're setting fields here we need
 451          * to prevent others from looking at until we're done.
 452          */
 453         error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
 454                          XFS_ILOCK_EXCL, &ip);
 455         if (error)
 456                 return error;
 457         ASSERT(ip != NULL);
 458
 459         ip->i_d.di_mode = mode;
 460         ip->i_d.di_onlink = 0;
 461         ip->i_d.di_nlink = nlink;
 462         ASSERT(ip->i_d.di_nlink == nlink);
 463         ip->i_d.di_uid = current_fsuid();
 464         ip->i_d.di_gid = current_fsgid();
 465         xfs_set_projid(ip, prid);
 466         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 467
 468         /*
 469          * If the superblock version is up to where we support new format
 470          * inodes and this is currently an old format inode, then change
 471          * the inode version number now.  This way we only do the conversion
 472          * here rather than here and in the flush/logging code.
 473          */
 474         if (xfs_sb_version_hasnlink(&mp->m_sb) &&
 475             ip->i_d.di_version == 1) {
 476                 ip->i_d.di_version = 2;
 477                 /*
 478                  * We've already zeroed the old link count, the projid field,
 479                  * and the pad field.
 480                  */
 481         }
 482
 483         /*
 484          * Project ids won't be stored on disk if we are using a version 1 inode.
 485          */
 486         if ((prid != 0) && (ip->i_d.di_version == 1))
 487                 xfs_bump_ino_vers2(tp, ip);
 488
 489         if (pip && XFS_INHERIT_GID(pip)) {
 490                 ip->i_d.di_gid = pip->i_d.di_gid;
 491                 if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
 492                         ip->i_d.di_mode |= S_ISGID;
 493                 }
 494         }
 495
 496         /*
 497          * If the group ID of the new file does not match the effective group
 498          * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
 499          * (and only if the irix_sgid_inherit compatibility variable is set).
 500          */
 501         if ((irix_sgid_inherit) &&
 502             (ip->i_d.di_mode & S_ISGID) &&
 503             (!in_group_p((gid_t)ip->i_d.di_gid))) {
 504                 ip->i_d.di_mode &= ~S_ISGID;
 505         }
 506
 507         ip->i_d.di_size = 0;
 508         ip->i_d.di_nextents = 0;
 509         ASSERT(ip->i_d.di_nblocks == 0);
 510
 511         nanotime(&tv);
 512         ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
 513         ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
 514         ip->i_d.di_atime = ip->i_d.di_mtime;
 515         ip->i_d.di_ctime = ip->i_d.di_mtime;
 516
 517         /*
 518          * di_gen will have been taken care of in xfs_iread.
 519          */
 520         ip->i_d.di_extsize = 0;
 521         ip->i_d.di_dmevmask = 0;
 522         ip->i_d.di_dmstate = 0;
 523         ip->i_d.di_flags = 0;
 524
 525         if (ip->i_d.di_version == 3) {
 526                 ASSERT(ip->i_d.di_ino == ino);
 527                 ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
 528                 ip->i_d.di_crc = 0;
 529                 ip->i_d.di_changecount = 1;
 530                 ip->i_d.di_lsn = 0;
 531                 ip->i_d.di_flags2 = 0;
 532                 memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
 533                 ip->i_d.di_crtime = ip->i_d.di_mtime;
 534         }
 535
 536
 537         flags = XFS_ILOG_CORE;
 538         switch (mode & S_IFMT) {
 539         case S_IFIFO:
 540         case S_IFCHR:
 541         case S_IFBLK:
 542         case S_IFSOCK:
 543                 ip->i_d.di_format = XFS_DINODE_FMT_DEV;
 544                 ip->i_df.if_u2.if_rdev = rdev;
 545                 ip->i_df.if_flags = 0;
 546                 flags |= XFS_ILOG_DEV;
 547                 break;
 548         case S_IFREG:
 549                 /*
 550                  * we can't set up filestreams until after the VFS inode
 551                  * is set up properly.
 552                  */
 553                 if (pip && xfs_inode_is_filestream(pip))
 554                         filestreams = 1;
 555                 /* fall through */
 556         case S_IFDIR:
 557                 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
 558                         uint    di_flags = 0;
 559
 560                         if (S_ISDIR(mode)) {
 561                                 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
 562                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 563                                 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
 564                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 565                                         ip->i_d.di_extsize = pip->i_d.di_extsize;
 566                                 }
 567                         } else if (S_ISREG(mode)) {
 568                                 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
 569                                         di_flags |= XFS_DIFLAG_REALTIME;
 570                                 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
 571                                         di_flags |= XFS_DIFLAG_EXTSIZE;
 572                                         ip->i_d.di_extsize = pip->i_d.di_extsize;
 573                                 }
 574                         }
 575                         if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
 576                             xfs_inherit_noatime)
 577                                 di_flags |= XFS_DIFLAG_NOATIME;
 578                         if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
 579                             xfs_inherit_nodump)
 580                                 di_flags |= XFS_DIFLAG_NODUMP;
 581                         if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
 582                             xfs_inherit_sync)
 583                                 di_flags |= XFS_DIFLAG_SYNC;
 584                         if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
 585                             xfs_inherit_nosymlinks)
 586                                 di_flags |= XFS_DIFLAG_NOSYMLINKS;
 587                         if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 588                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 589                         if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
 590                             xfs_inherit_nodefrag)
 591                                 di_flags |= XFS_DIFLAG_NODEFRAG;
 592                         if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
 593                                 di_flags |= XFS_DIFLAG_FILESTREAM;
 594                         ip->i_d.di_flags |= di_flags;
 595                 }
 596                 /* FALLTHROUGH */
 597         case S_IFLNK:
 598                 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
 599                 ip->i_df.if_flags = XFS_IFEXTENTS;
 600                 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
 601                 ip->i_df.if_u1.if_extents = NULL;
 602                 break;
 603         default:
 604                 ASSERT(0);
 605         }
 606         /*
 607          * Attribute fork settings for new inode.
 608          */
 609         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
 610         ip->i_d.di_anextents = 0;
 611
 612         /*
 613          * Log the new values stuffed into the inode.
 614          */
 615         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 616         xfs_trans_log_inode(tp, ip, flags);
 617
 618         /* now that we have an i_mode we can setup inode ops and unlock */
 619         xfs_setup_inode(ip);
 620
 621         /* now we have set up the vfs inode we can associate the filestream */
 622         if (filestreams) {
 623                 error = xfs_filestream_associate(pip, ip);
 624                 if (error < 0)
 625                         return -error;
 626                 if (!error)
 627                         xfs_iflags_set(ip, XFS_IFILESTREAM);
 628         }
 629
 630         *ipp = ip;
 631         return 0;
 632 }
 633
 634 /*
 635  * Free up the underlying blocks past new_size.  The new size must be smaller
 636  * than the current size.  This routine can be used both for the attribute and
 637  * data fork, and does not modify the inode size, which is left to the caller.
 638  *
 639  * The transaction passed to this routine must have made a permanent log
 640  * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 641  * given transaction and start new ones, so make sure everything involved in
 642  * the transaction is tidy before calling here.  Some transaction will be
 643  * returned to the caller to be committed.  The incoming transaction must
 644  * already include the inode, and both inode locks must be held exclusively.
 645  * The inode must also be "held" within the transaction.  On return the inode
 646  * will be "held" within the returned transaction.  This routine does NOT
 647  * require any disk space to be reserved for it within the transaction.
 648  *
 649  * If we get an error, we must return with the inode locked and linked into the
 650  * current transaction. This keeps things simple for the higher level code,
 651  * because it always knows that the inode is locked and held in the transaction
 652  * that returns to it whether errors occur or not.  We don't mark the inode
 653  * dirty on error so that transactions can be easily aborted if possible.
 654  */
 655 int
 656 xfs_itruncate_extents(
 657         struct xfs_trans        **tpp,
 658         struct xfs_inode        *ip,
 659         int                     whichfork,
 660         xfs_fsize_t             new_size)
 661 {
 662         struct xfs_mount        *mp = ip->i_mount;
 663         struct xfs_trans        *tp = *tpp;
 664         struct xfs_trans        *ntp;
 665         xfs_bmap_free_t         free_list;
 666         xfs_fsblock_t           first_block;
 667         xfs_fileoff_t           first_unmap_block;
 668         xfs_fileoff_t           last_block;
 669         xfs_filblks_t           unmap_len;
 670         int                     committed;
 671         int                     error = 0;
 672         int                     done = 0;
 673
 674         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 675         ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
 676                xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 677         ASSERT(new_size <= XFS_ISIZE(ip));
 678         ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 679         ASSERT(ip->i_itemp != NULL);
 680         ASSERT(ip->i_itemp->ili_lock_flags == 0);
 681         ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
 682
 683         trace_xfs_itruncate_extents_start(ip, new_size);
 684
 685         /*
 686          * Since it is possible for space to become allocated beyond
 687          * the end of the file (in a crash where the space is allocated
 688          * but the inode size is not yet updated), simply remove any
 689          * blocks which show up between the new EOF and the maximum
 690          * possible file size.  If the first block to be removed is
 691          * beyond the maximum file size (ie it is the same as last_block),
 692          * then there is nothing to do.
 693          */
 694         first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
 695         last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 696         if (first_unmap_block == last_block)
 697                 return 0;
 698
 699         ASSERT(first_unmap_block < last_block);
 700         unmap_len = last_block - first_unmap_block + 1;
 701         while (!done) {
 702                 xfs_bmap_init(&free_list, &first_block);
 703                 error = xfs_bunmapi(tp, ip,
 704                                     first_unmap_block, unmap_len,
 705                                     xfs_bmapi_aflag(whichfork),
 706                                     XFS_ITRUNC_MAX_EXTENTS,
 707                                     &first_block, &free_list,
 708                                     &done);
 709                 if (error)
 710                         goto out_bmap_cancel;
 711
 712                 /*
 713                  * Duplicate the transaction that has the permanent
 714                  * reservation and commit the old transaction.
 715                  */
 716                 error = xfs_bmap_finish(&tp, &free_list, &committed);
 717                 if (committed)
 718                         xfs_trans_ijoin(tp, ip, 0);
 719                 if (error)
 720                         goto out_bmap_cancel;
 721
 722                 if (committed) {
 723                         /*
 724                          * Mark the inode dirty so it will be logged and
 725                          * moved forward in the log as part of every commit.
 726                          */
 727                         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 728                 }
 729
 730                 ntp = xfs_trans_dup(tp);
 731                 error = xfs_trans_commit(tp, 0);
 732                 tp = ntp;
 733
 734                 xfs_trans_ijoin(tp, ip, 0);
 735
 736                 if (error)
 737                         goto out;
 738
 739                 /*
 740                  * Transaction commit worked ok so we can drop the extra ticket
 741                  * reference that we gained in xfs_trans_dup()
 742                  */
 743                 xfs_log_ticket_put(tp->t_ticket);
 744                 error = xfs_trans_reserve(tp, 0,
 745                                         XFS_ITRUNCATE_LOG_RES(mp), 0,
 746                                         XFS_TRANS_PERM_LOG_RES,
 747                                         XFS_ITRUNCATE_LOG_COUNT);
 748                 if (error)
 749                         goto out;
 750         }
 751
 752         /*
 753          * Always re-log the inode so that our permanent transaction can keep
 754          * on rolling it forward in the log.
 755          */
 756         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 757
 758         trace_xfs_itruncate_extents_end(ip, new_size);
 759
 760 out:
 761         *tpp = tp;
 762         return error;
 763 out_bmap_cancel:
 764         /*
 765          * If the bunmapi call encounters an error, return to the caller where
 766          * the transaction can be properly aborted.  We just need to make sure
 767          * we're not holding any resources that we were not when we came in.
 768          */
 769         xfs_bmap_cancel(&free_list);
 770         goto out;
 771 }
 772
 773 /*
 774  * This is called when the inode's link count goes to 0.
 775  * We place the on-disk inode on a list in the AGI.  It
 776  * will be pulled from this list when the inode is freed.
      *
      * The inode is pushed onto the front of the AGI unlinked bucket selected
      * by (agino % XFS_AGI_UNLINKED_BUCKETS).  When that bucket is not empty,
      * the old head is first stored in this inode's on-disk di_next_unlinked.
      * Only the changed di_next_unlinked field and the changed AGI bucket
      * slot are logged in @tp.
      *
      * Returns 0 on success, otherwise the error returned by xfs_read_agi()
      * or xfs_imap_to_bp().
 777  */
 778 int
 779 xfs_iunlink(
 780         xfs_trans_t     *tp,
 781         xfs_inode_t     *ip)
 782 {
 783         xfs_mount_t     *mp;
 784         xfs_agi_t       *agi;
 785         xfs_dinode_t    *dip;
 786         xfs_buf_t       *agibp;
 787         xfs_buf_t       *ibp;
 788         xfs_agino_t     agino;
 789         short           bucket_index;
 790         int             offset;
 791         int             error;
 792
             /*
              * Only inodes with no remaining links that have not yet been
              * marked free (di_mode == 0 is what xfs_ifree() sets) belong on
              * the unlinked list.
              */
 793         ASSERT(ip->i_d.di_nlink == 0);
 794         ASSERT(ip->i_d.di_mode != 0);
 795
 796         mp = tp->t_mountp;
 797
 798         /*
 799          * Get the agi buffer first.  It ensures lock ordering
 800          * on the list.
 801          */
 802         error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
 803         if (error)
 804                 return error;
 805         agi = XFS_BUF_TO_AGI(agibp);
 806
 807         /*
 808          * Get the index into the agi hash table for the
 809          * list this inode will go on.
 810          */
 811         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 812         ASSERT(agino != 0);
 813         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
             /*
              * A bucket slot is never expected to be zero (an empty bucket is
              * tested for as NULLAGINO below), and the inode must not already
              * be the head of its bucket.
              */
 814         ASSERT(agi->agi_unlinked[bucket_index]);
 815         ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
 816
 817         if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
 818                 /*
 819                  * There is already another inode in the bucket we need
 820                  * to add ourselves to.  Add us at the front of the list.
 821                  * Here we put the head pointer into our next pointer,
 822                  * and then we fall through to point the head at us.
 823                  */
 824                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
 825                                        0, 0);
 826                 if (error)
 827                         return error;
 828
                     /*
                      * Both values are already big-endian (on-disk format),
                      * so the old head is copied without byte swapping.
                      */
 829                 ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
 830                 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
 831                 offset = ip->i_imap.im_boffset +
 832                         offsetof(xfs_dinode_t, di_next_unlinked);
 833
 834                 /* need to recalc the inode CRC if appropriate */
 835                 xfs_dinode_calc_crc(mp, dip);
 836
                     /* Log just the bytes of di_next_unlinked in the inode buffer. */
 837                 xfs_trans_inode_buf(tp, ibp);
 838                 xfs_trans_log_buf(tp, ibp, offset,
 839                                   (offset + sizeof(xfs_agino_t) - 1));
 840                 xfs_inobp_check(mp, ibp);
 841         }
 842
 843         /*
 844          * Point the bucket head pointer at the inode being inserted.
              * Only the one modified agi_unlinked[] slot is logged.
 845          */
 846         ASSERT(agino != 0);
 847         agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
 848         offset = offsetof(xfs_agi_t, agi_unlinked) +
 849                 (sizeof(xfs_agino_t) * bucket_index);
 850         xfs_trans_log_buf(tp, agibp, offset,
 851                           (offset + sizeof(xfs_agino_t) - 1));
 852         return 0;
 853 }
 854
 855 /*
 856  * Pull the on-disk inode from the AGI unlinked list.
      *
      * Two cases are handled:
      *  - The inode is the head of its bucket: the bucket head is pointed at
      *    the inode's di_next_unlinked and the AGI slot is logged.
      *  - Otherwise the bucket's singly linked list is walked from the head
      *    until the inode whose di_next_unlinked refers to us is found; that
      *    predecessor is then pointed at our successor and logged.
      * In both cases our own di_next_unlinked is reset to NULLAGINO (and
      * logged) unless it already holds that value.
      *
      * Returns 0 on success, otherwise the error returned by xfs_read_agi(),
      * xfs_imap() or xfs_imap_to_bp().
 857  */
 858 STATIC int
 859 xfs_iunlink_remove(
 860         xfs_trans_t     *tp,
 861         xfs_inode_t     *ip)
 862 {
 863         xfs_ino_t       next_ino;
 864         xfs_mount_t     *mp;
 865         xfs_agi_t       *agi;
 866         xfs_dinode_t    *dip;
 867         xfs_buf_t       *agibp;
 868         xfs_buf_t       *ibp;
 869         xfs_agnumber_t  agno;
 870         xfs_agino_t     agino;
 871         xfs_agino_t     next_agino;
 872         xfs_buf_t       *last_ibp;
 873         xfs_dinode_t    *last_dip = NULL;
 874         short           bucket_index;
 875         int             offset, last_offset = 0;
 876         int             error;
 877
 878         mp = tp->t_mountp;
 879         agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 880
 881         /*
 882          * Get the agi buffer first.  It ensures lock ordering
 883          * on the list.
 884          */
 885         error = xfs_read_agi(mp, tp, agno, &agibp);
 886         if (error)
 887                 return error;
 888
 889         agi = XFS_BUF_TO_AGI(agibp);
 890
 891         /*
 892          * Get the index into the agi hash table for the
 893          * list this inode will go on.
 894          */
 895         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 896         ASSERT(agino != 0);
 897         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
             /* The bucket must not be empty: we expect to find ourselves on it. */
 898         ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
 899         ASSERT(agi->agi_unlinked[bucket_index]);
 900
 901         if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
 902                 /*
 903                  * We're at the head of the list.  Get the inode's on-disk
 904                  * buffer to see if there is anyone after us on the list.
 905                  * Only modify our next pointer if it is not already NULLAGINO.
 906                  * This saves us the overhead of dealing with the buffer when
 907                  * there is no need to change it.
 908                  */
 909                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
 910                                        0, 0);
 911                 if (error) {
 912                         xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
 913                                 __func__, error);
 914                         return error;
 915                 }
 916                 next_agino = be32_to_cpu(dip->di_next_unlinked);
 917                 ASSERT(next_agino != 0);
 918                 if (next_agino != NULLAGINO) {
 919                         dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
 920                         offset = ip->i_imap.im_boffset +
 921                                 offsetof(xfs_dinode_t, di_next_unlinked);
 922
 923                         /* need to recalc the inode CRC if appropriate */
 924                         xfs_dinode_calc_crc(mp, dip);
 925
 926                         xfs_trans_inode_buf(tp, ibp);
 927                         xfs_trans_log_buf(tp, ibp, offset,
 928                                           (offset + sizeof(xfs_agino_t) - 1));
 929                         xfs_inobp_check(mp, ibp);
 930                 } else {
                             /* Buffer was not modified, so just release it. */
 931                         xfs_trans_brelse(tp, ibp);
 932                 }
 933                 /*
 934                  * Point the bucket head pointer at the next inode.
                      * next_agino may be NULLAGINO, which leaves the bucket
                      * empty.
 935                  */
 936                 ASSERT(next_agino != 0);
 937                 ASSERT(next_agino != agino);
 938                 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
 939                 offset = offsetof(xfs_agi_t, agi_unlinked) +
 940                         (sizeof(xfs_agino_t) * bucket_index);
 941                 xfs_trans_log_buf(tp, agibp, offset,
 942                                   (offset + sizeof(xfs_agino_t) - 1));
 943         } else {
 944                 /*
 945                  * We need to search the list for the inode being freed.
                      *
                      * After the loop, last_ibp/last_dip/last_offset describe
                      * the inode whose di_next_unlinked equals our agino.
                      * NOTE(review): running off the end of the list
                      * (NULLAGINO) is only caught by the ASSERTs in the loop;
                      * on builds without ASSERT the outcome depends on how
                      * xfs_imap() treats that value - confirm.
 946                  */
 947                 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
 948                 last_ibp = NULL;
 949                 while (next_agino != agino) {
 950                         struct xfs_imap imap;
 951
                             /*
                              * Drop the buffer of the inode visited on the
                              * previous iteration; only the most recent
                              * candidate predecessor is kept.
                              */
 952                         if (last_ibp)
 953                                 xfs_trans_brelse(tp, last_ibp);
 954
                             /*
                              * NOTE(review): presumably im_blkno == 0 tells
                              * xfs_imap() that no mapping is cached yet -
                              * confirm against xfs_imap().
                              */
 955                         imap.im_blkno = 0;
 956                         next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
 957
 958                         error = xfs_imap(mp, tp, next_ino, &imap, 0);
 959                         if (error) {
 960                                 xfs_warn(mp,
 961         "%s: xfs_imap returned error %d.",
 962                                          __func__, error);
 963                                 return error;
 964                         }
 965
 966                         error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
 967                                                &last_ibp, 0, 0);
 968                         if (error) {
 969                                 xfs_warn(mp,
 970         "%s: xfs_imap_to_bp returned error %d.",
 971                                         __func__, error);
 972                                 return error;
 973                         }
 974
 975                         last_offset = imap.im_boffset;
 976                         next_agino = be32_to_cpu(last_dip->di_next_unlinked);
 977                         ASSERT(next_agino != NULLAGINO);
 978                         ASSERT(next_agino != 0);
 979                 }
 980
 981                 /*
 982                  * Now last_ibp points to the buffer previous to us on the
 983                  * unlinked list.  Pull us from the list.
 984                  */
 985                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
 986                                        0, 0);
 987                 if (error) {
 988                         xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
 989                                 __func__, error);
 990                         return error;
 991                 }
 992                 next_agino = be32_to_cpu(dip->di_next_unlinked);
 993                 ASSERT(next_agino != 0);
 994                 ASSERT(next_agino != agino);
 995                 if (next_agino != NULLAGINO) {
 996                         dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
 997                         offset = ip->i_imap.im_boffset +
 998                                 offsetof(xfs_dinode_t, di_next_unlinked);
 999
1000                         /* need to recalc the inode CRC if appropriate */
1001                         xfs_dinode_calc_crc(mp, dip);
1002
1003                         xfs_trans_inode_buf(tp, ibp);
1004                         xfs_trans_log_buf(tp, ibp, offset,
1005                                           (offset + sizeof(xfs_agino_t) - 1));
1006                         xfs_inobp_check(mp, ibp);
1007                 } else {
                             /* Our next pointer is already NULLAGINO; nothing to log. */
1008                         xfs_trans_brelse(tp, ibp);
1009                 }
1010                 /*
1011                  * Point the previous inode on the list to the next inode.
                      * Only the predecessor's di_next_unlinked bytes are logged.
1012                  */
1013                 last_dip->di_next_unlinked = cpu_to_be32(next_agino);
1014                 ASSERT(next_agino != 0);
1015                 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
1016
1017                 /* need to recalc the inode CRC if appropriate */
1018                 xfs_dinode_calc_crc(mp, last_dip);
1019
1020                 xfs_trans_inode_buf(tp, last_ibp);
1021                 xfs_trans_log_buf(tp, last_ibp, offset,
1022                                   (offset + sizeof(xfs_agino_t) - 1));
1023                 xfs_inobp_check(mp, last_ibp);
1024         }
1025         return 0;
1026 }
1027
1028 /*
1029  * A big issue when freeing the inode cluster is that we _cannot_ skip any
1030  * inodes that are in memory - they all must be marked stale and attached to
1031  * the cluster buffer.
      *
      * @free_ip: the inode being freed.  Its ILOCK is never taken or dropped
      *           here; xfs_ifree() asserts that the caller already holds it
      *           exclusively.
      * @tp:      transaction the cluster buffers are obtained and invalidated in.
      * @inum:    inode number the walk starts at; xfs_ifree() passes the
      *           first_ino reported by xfs_difree().
      *
      * For every cluster buffer of the chunk: get and lock the buffer, mark the
      * inode log items already attached to it stale, then look up each
      * in-memory inode of the cluster, mark it XFS_ISTALE and, if it is dirty,
      * attach its log item to the buffer with xfs_istale_done as the callback.
      * Finally the buffer is invalidated in the transaction.
      *
      * Returns 0 on success, or ENOMEM if a cluster buffer cannot be obtained.
1032  */
1033 STATIC int
1034 xfs_ifree_cluster(
1035         xfs_inode_t     *free_ip,
1036         xfs_trans_t     *tp,
1037         xfs_ino_t       inum)
1038 {
1039         xfs_mount_t             *mp = free_ip->i_mount;
1040         int                     blks_per_cluster;
1041         int                     nbufs;
1042         int                     ninodes;
1043         int                     i, j;
1044         xfs_daddr_t             blkno;
1045         xfs_buf_t               *bp;
1046         xfs_inode_t             *ip;
1047         xfs_inode_log_item_t    *iip;
1048         xfs_log_item_t          *lip;
1049         struct xfs_perag        *pag;
1050
             /* Reference on the perag; must be dropped on every return path. */
1051         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
             /*
              * Work out how many filesystem blocks back one cluster buffer,
              * how many inodes each buffer holds and how many buffers cover
              * the whole inode chunk.
              */
1052         if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1053                 blks_per_cluster = 1;
1054                 ninodes = mp->m_sb.sb_inopblock;
1055                 nbufs = XFS_IALLOC_BLOCKS(mp);
1056         } else {
1057                 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
1058                                         mp->m_sb.sb_blocksize;
1059                 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
1060                 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1061         }
1062
1063         for (j = 0; j < nbufs; j++, inum += ninodes) {
1064                 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1065                                          XFS_INO_TO_AGBNO(mp, inum));
1066
1067                 /*
1068                  * We obtain and lock the backing buffer first in the process
1069                  * here, as we have to ensure that any dirty inode that we
1070                  * can't get the flush lock on is attached to the buffer.
1071                  * If we scan the in-memory inodes first, then buffer IO can
1072                  * complete before we get a lock on it, and hence we may fail
1073                  * to mark all the active inodes on the buffer stale.
1074                  */
1075                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1076                                         mp->m_bsize * blks_per_cluster,
1077                                         XBF_UNMAPPED);
1078
1079                 if (!bp) {
                             /*
                              * Drop the perag reference taken at the top of
                              * the function, mirroring the normal exit path;
                              * returning without it leaks the reference.
                              */
                             xfs_perag_put(pag);
1080                         return ENOMEM;
                     }
1081
1082                 /*
1083                  * This buffer may not have been correctly initialised as we
1084                  * didn't read it from disk. That's not important because we are
1085                  * only using it to mark the buffer as stale in the log, and to
1086                  * attach stale cached inodes on it. That means it will never be
1087                  * dispatched for IO. If it is, we want to know about it, and we
1088                  * want it to fail. We can achieve this by adding a write
1089                  * verifier to the buffer.
1090                  */
1091                 bp->b_ops = &xfs_inode_buf_ops;
1092
1093                 /*
1094                  * Walk the inodes already attached to the buffer and mark them
1095                  * stale. These will all have the flush locks held, so an
1096                  * in-memory inode walk can't lock them. By marking them all
1097                  * stale first, we will not attempt to lock them in the loop
1098                  * below as the XFS_ISTALE flag will be set.
1099                  */
1100                 lip = bp->b_fspriv;
1101                 while (lip) {
1102                         if (lip->li_type == XFS_LI_INODE) {
1103                                 iip = (xfs_inode_log_item_t *)lip;
1104                                 ASSERT(iip->ili_logged == 1);
1105                                 lip->li_cb = xfs_istale_done;
1106                                 xfs_trans_ail_copy_lsn(mp->m_ail,
1107                                                         &iip->ili_flush_lsn,
1108                                                         &iip->ili_item.li_lsn);
1109                                 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1110                         }
1111                         lip = lip->li_bio_list;
1112                 }
1113
1114
1115                 /*
1116                  * For each inode in memory attempt to add it to the inode
1117                  * buffer and set it up for being staled on buffer IO
1118                  * completion.  This is safe as we've locked out tail pushing
1119                  * and flushing by locking the buffer.
1120                  *
1121                  * We have already marked every inode that was part of a
1122                  * transaction stale above, which means there is no point in
1123                  * even trying to lock them.
1124                  */
1125                 for (i = 0; i < ninodes; i++) {
1126 retry:
1127                         rcu_read_lock();
1128                         ip = radix_tree_lookup(&pag->pag_ici_root,
1129                                         XFS_INO_TO_AGINO(mp, (inum + i)));
1130
1131                         /* Inode not in memory, nothing to do */
1132                         if (!ip) {
1133                                 rcu_read_unlock();
1134                                 continue;
1135                         }
1136
1137                         /*
1138                          * because this is an RCU protected lookup, we could
1139                          * find a recently freed or even reallocated inode
1140                          * during the lookup. We need to check under the
1141                          * i_flags_lock for a valid inode here. Skip it if it
1142                          * is not valid, the wrong inode or stale.
1143                          */
1144                         spin_lock(&ip->i_flags_lock);
1145                         if (ip->i_ino != inum + i ||
1146                             __xfs_iflags_test(ip, XFS_ISTALE)) {
1147                                 spin_unlock(&ip->i_flags_lock);
1148                                 rcu_read_unlock();
1149                                 continue;
1150                         }
1151                         spin_unlock(&ip->i_flags_lock);
1152
1153                         /*
1154                          * Don't try to lock/unlock the current inode, but we
1155                          * _cannot_ skip the other inodes that we did not find
1156                          * in the list attached to the buffer and are not
1157                          * already marked stale. If we can't lock it, back off
1158                          * and retry.
1159                          */
1160                         if (ip != free_ip &&
1161                             !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1162                                 rcu_read_unlock();
1163                                 delay(1);
1164                                 goto retry;
1165                         }
1166                         rcu_read_unlock();
1167
1168                         xfs_iflock(ip);
1169                         xfs_iflags_set(ip, XFS_ISTALE);
1170
1171                         /*
1172                          * we don't need to attach clean inodes or those only
1173                          * with unlogged changes (which we throw away, anyway).
1174                          */
1175                         iip = ip->i_itemp;
1176                         if (!iip || xfs_inode_clean(ip)) {
1177                                 ASSERT(ip != free_ip);
1178                                 xfs_ifunlock(ip);
1179                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1180                                 continue;
1181                         }
1182
                             /*
                              * Move the pending dirty-field mask aside and mark
                              * the item logged before attaching it to the
                              * buffer; the flush lock taken above stays held.
                              */
1183                         iip->ili_last_fields = iip->ili_fields;
1184                         iip->ili_fields = 0;
1185                         iip->ili_logged = 1;
1186                         xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
1187                                                 &iip->ili_item.li_lsn);
1188
1189                         xfs_buf_attach_iodone(bp, xfs_istale_done,
1190                                                   &iip->ili_item);
1191
1192                         if (ip != free_ip)
1193                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1194                 }
1195
1196                 xfs_trans_stale_inode_buf(tp, bp);
1197                 xfs_trans_binval(tp, bp);
1198         }
1199
1200         xfs_perag_put(pag);
1201         return 0;
1202 }
1203
1204 /*
1205  * This is called to return an inode to the inode free list.
1206  * The inode should already be truncated to 0 length and have
1207  * no pages associated with it.  This routine also assumes that
1208  * the inode is already a part of the transaction.
1209  *
1210  * The on-disk copy of the inode will have been added to the list
1211  * of unlinked inodes in the AGI. We need to remove the inode from
1212  * that list atomically with respect to freeing it here.
      *
      * @tp:    transaction the removal and free are performed in.
      * @ip:    inode to free; must be locked XFS_ILOCK_EXCL and have no links,
      *         extents or blocks (see the ASSERTs below).
      * @flist: free list handed through to xfs_difree().
      *
      * Returns 0 on success, otherwise the error from xfs_iunlink_remove(),
      * xfs_difree() or xfs_ifree_cluster().
1213  */
1214 int
1215 xfs_ifree(
1216         xfs_trans_t     *tp,
1217         xfs_inode_t     *ip,
1218         xfs_bmap_free_t *flist)
1219 {
1220         int                     error;
1221         int                     delete;
1222         xfs_ino_t               first_ino;
1223
1224         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1225         ASSERT(ip->i_d.di_nlink == 0);
1226         ASSERT(ip->i_d.di_nextents == 0);
1227         ASSERT(ip->i_d.di_anextents == 0);
1228         ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
1229         ASSERT(ip->i_d.di_nblocks == 0);
1230
1231         /*
1232          * Pull the on-disk inode from the AGI unlinked list.
1233          */
1234         error = xfs_iunlink_remove(tp, ip);
1235         if (error)
1236                 return error;
1237
             /*
              * xfs_difree() fills in 'delete' and 'first_ino'; when 'delete' is
              * non-zero the inode cluster starting at 'first_ino' is torn down
              * below.
              */
1238         error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
1239         if (error)
1240                 return error;
1241
1242         ip->i_d.di_mode = 0;            /* mark incore inode as free */
1243         ip->i_d.di_flags = 0;
1244         ip->i_d.di_dmevmask = 0;
1245         ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
1246         ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1247         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1248         /*
1249          * Bump the generation count so no one will be confused
1250          * by reincarnations of this inode.
1251          */
1252         ip->i_d.di_gen++;
1253         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1254
             /* 'error' is still 0 here unless the cluster free fails. */
1255         if (delete)
1256                 error = xfs_ifree_cluster(ip, tp, first_ino);
1257
1258         return error;
1259 }
1260
1261 /*
1262  * This is called to unpin an inode.  The caller must have the inode locked
1263  * in at least shared mode so that the buffer cannot be subsequently pinned
1264  * once someone is waiting for it to be unpinned.
      *
      * This function only starts the unpin by forcing the log up to the LSN
      * recorded in the inode log item (ili_last_lsn); it does not itself wait
      * for the pin count to drop.  It dereferences ip->i_itemp, so the inode
      * must have a log item attached.
1265  */
1266 static void
1267 xfs_iunpin(
1268         struct xfs_inode        *ip)
1269 {
1270         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
1271
1272         trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
1273
1274         /* Give the log a push to start the unpinning I/O */
             /* NOTE(review): flags == 0 presumably means a non-synchronous force - confirm. */
1275         xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
1276
1277 }
1278
1279 static void
1280 __xfs_iunpin_wait(
1281         struct xfs_inode        *ip)
1282 {
             /*
              * Slow path of xfs_iunpin_wait(): push the log via xfs_iunpin() and
              * then sleep uninterruptibly on the bit waitqueue for
              * __XFS_IPINNED_BIT in ip->i_flags until xfs_ipincount(ip) reads
              * zero.
              */
1283         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
1284         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
1285
1286         xfs_iunpin(ip);
1287
             /*
              * The pin count is re-checked after prepare_to_wait() has queued
              * us and we only sleep if it is still non-zero - presumably so
              * that a wakeup arriving between the check and the sleep is not
              * lost.
              */
1288         do {
1289                 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1290                 if (xfs_ipincount(ip))
1291                         io_schedule();
1292         } while (xfs_ipincount(ip));
1293         finish_wait(wq, &wait.wait);
1294 }
1295
1296 void
1297 xfs_iunpin_wait(
1298         struct xfs_inode        *ip)
1299 {
             /*
              * Wait for @ip to become unpinned.  Fast path: if the pin count is
              * already zero there is nothing to wait for and we return at once;
              * otherwise __xfs_iunpin_wait() pushes the log and sleeps.
              */
1300         if (xfs_ipincount(ip))
1301                 __xfs_iunpin_wait(ip);
1302 }
1303
1304 STATIC int
1305 xfs_iflush_cluster(
1306         xfs_inode_t     *ip,
1307         xfs_buf_t       *bp)
1308 {
             /*
              * Opportunistically flush the other dirty, unpinned in-memory
              * inodes that share @ip's inode cluster into @bp, the cluster
              * buffer @ip itself has already been flushed to.
              *
              * Inodes are found with an RCU-protected radix tree gang lookup and
              * are only flushed if their ILOCK (shared) and flush lock can be
              * taken without blocking.
              *
              * Returns 0 when the scan completes, and also when the lookup array
              * cannot be allocated or no inodes are found (clustering is best
              * effort).  If xfs_iflush_int() fails, the filesystem is shut
              * down, @bp is released or completed with EIO, the failing inode's
              * flush is aborted and EFSCORRUPTED is returned.
              */
1309         xfs_mount_t             *mp = ip->i_mount;
1310         struct xfs_perag        *pag;
1311         unsigned long           first_index, mask;
1312         unsigned long           inodes_per_cluster;
1313         int                     ilist_size;
1314         xfs_inode_t             **ilist;
1315         xfs_inode_t             *iq;
1316         int                     nr_found;
1317         int                     clcount = 0;
1318         int                     bufwasdelwri;
1319         int                     i;
1320
1321         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1322
1323         inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
1324         ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
1325         ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
1326         if (!ilist)
1327                 goto out_put;
1328
             /* first_index is the AG inode number of the first inode in ip's cluster. */
1329         mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
1330         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
1331         rcu_read_lock();
1332         /* really need a gang lookup range call here */
1333         nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
1334                                         first_index, inodes_per_cluster);
1335         if (nr_found == 0)
1336                 goto out_free;
1337
1338         for (i = 0; i < nr_found; i++) {
1339                 iq = ilist[i];
1340                 if (iq == ip)
1341                         continue;
1342
1343                 /*
1344                  * because this is an RCU protected lookup, we could find a
1345                  * recently freed or even reallocated inode during the lookup.
1346                  * We need to check under the i_flags_lock for a valid inode
1347                  * here. Skip it if it is not valid or the wrong inode.
                      *
                      * The check must be made on iq, the inode just looked up.
                      * Testing ip (the caller's inode, which is known valid)
                      * would never detect a freed iq.
1348                  */
1349                 spin_lock(&iq->i_flags_lock);
1350                 if (!iq->i_ino ||
1351                     (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
1352                         spin_unlock(&iq->i_flags_lock);
1353                         continue;
1354                 }
1355                 spin_unlock(&iq->i_flags_lock);
1356
1357                 /*
1358                  * Do an un-protected check to see if the inode is dirty and
1359                  * is a candidate for flushing.  These checks will be repeated
1360                  * later after the appropriate locks are acquired.
1361                  */
1362                 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
1363                         continue;
1364
1365                 /*
1366                  * Try to get locks.  If any are unavailable or it is pinned,
1367                  * then this inode cannot be flushed and is skipped.
1368                  */
1369
1370                 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
1371                         continue;
1372                 if (!xfs_iflock_nowait(iq)) {
1373                         xfs_iunlock(iq, XFS_ILOCK_SHARED);
1374                         continue;
1375                 }
1376                 if (xfs_ipincount(iq)) {
1377                         xfs_ifunlock(iq);
1378                         xfs_iunlock(iq, XFS_ILOCK_SHARED);
1379                         continue;
1380                 }
1381
1382                 /*
1383                  * arriving here means that this inode can be flushed.  First
1384                  * re-check that it's dirty before flushing.
1385                  */
1386                 if (!xfs_inode_clean(iq)) {
1387                         int     error;
1388                         error = xfs_iflush_int(iq, bp);
1389                         if (error) {
1390                                 xfs_iunlock(iq, XFS_ILOCK_SHARED);
1391                                 goto cluster_corrupt_out;
1392                         }
1393                         clcount++;
1394                 } else {
1395                         xfs_ifunlock(iq);
1396                 }
1397                 xfs_iunlock(iq, XFS_ILOCK_SHARED);
1398         }
1399
1400         if (clcount) {
1401                 XFS_STATS_INC(xs_icluster_flushcnt);
1402                 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
1403         }
1404
1405 out_free:
1406         rcu_read_unlock();
1407         kmem_free(ilist);
1408 out_put:
1409         xfs_perag_put(pag);
1410         return 0;
1411
1412
1413 cluster_corrupt_out:
1414         /*
1415          * Corruption detected in the clustering loop.  Invalidate the
1416          * inode buffer and shut down the filesystem.
1417          */
1418         rcu_read_unlock();
1419         /*
1420          * Clean up the buffer.  If it was delwri, just release it --
1421          * brelse can handle it with no problems.  If not, shut down the
1422          * filesystem before releasing the buffer.
1423          */
1424         bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
1425         if (bufwasdelwri)
1426                 xfs_buf_relse(bp);
1427
1428         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1429
1430         if (!bufwasdelwri) {
1431                 /*
1432                  * Just like incore_relse: if we have b_iodone functions,
1433                  * mark the buffer as an error and call them.  Otherwise
1434                  * mark it as stale and brelse.
1435                  */
1436                 if (bp->b_iodone) {
1437                         XFS_BUF_UNDONE(bp);
1438                         xfs_buf_stale(bp);
1439                         xfs_buf_ioerror(bp, EIO);
1440                         xfs_buf_ioend(bp, 0);
1441                 } else {
1442                         xfs_buf_stale(bp);
1443                         xfs_buf_relse(bp);
1444                 }
1445         }
1446
1447         /*
1448          * Unlocks the flush lock
              * (iq is the inode whose xfs_iflush_int() call failed; its flush
              * lock is still held at this point.)
1449          */
1450         xfs_iflush_abort(iq, false);
1451         kmem_free(ilist);
1452         xfs_perag_put(pag);
1453         return XFS_ERROR(EFSCORRUPTED);
1454 }
1455
1456 /*
1457  * Flush dirty inode metadata into the backing buffer.
1458  *
1459  * The caller must have the inode lock and the inode flush lock held.  The
1460  * inode lock will still be held upon return to the caller, and the inode
1461  * flush lock will be released after the inode has reached the disk.
1462  *
1463  * The caller must write out the buffer returned in *bpp and release it.
1464  */
1465 int
1466 xfs_iflush(
1467         struct xfs_inode        *ip,
1468         struct xfs_buf          **bpp)
1469 {
1470         struct xfs_mount        *mp = ip->i_mount;
1471         struct xfs_buf          *bp;
1472         struct xfs_dinode       *dip;
1473         int                     error;
1474
1475         XFS_STATS_INC(xs_iflush_count);
1476
1477         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
1478         ASSERT(xfs_isiflocked(ip));
1479         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
1480                ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
1481
1482         *bpp = NULL;
1483
1484         xfs_iunpin_wait(ip);
1485
1486         /*
1487          * For stale inodes we cannot rely on the backing buffer remaining
1488          * stale in cache for the remaining life of the stale inode and so
1489          * xfs_imap_to_bp() below may give us a buffer that no longer contains
1490          * inodes below. We have to check this after ensuring the inode is
1491          * unpinned so that it is safe to reclaim the stale inode after the
1492          * flush call.
1493          */
1494         if (xfs_iflags_test(ip, XFS_ISTALE)) {
1495                 xfs_ifunlock(ip);
1496                 return 0;
1497         }
1498
1499         /*
1500          * This may have been unpinned because the filesystem is shutting
1501          * down forcibly. If that's the case we must not write this inode
1502          * to disk, because the log record didn't make it to disk.
1503          *
1504          * We also have to remove the log item from the AIL in this case,
1505          * as we wait for an empty AIL as part of the unmount process.
1506          */
1507         if (XFS_FORCED_SHUTDOWN(mp)) {
1508                 error = XFS_ERROR(EIO);
1509                 goto abort_out;
1510         }
1511
1512         /*
1513          * Get the buffer containing the on-disk inode.
1514          */
1515         error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
1516                                0);
1517         if (error || !bp) {
1518                 xfs_ifunlock(ip);
1519                 return error;
1520         }
1521
1522         /*
1523          * First flush out the inode that xfs_iflush was called with.
1524          */
1525         error = xfs_iflush_int(ip, bp);
1526         if (error)
1527                 goto corrupt_out;
1528
1529         /*
1530          * If the buffer is pinned then push on the log now so we won't
1531          * get stuck waiting in the write for too long.
1532          */
1533         if (xfs_buf_ispinned(bp))
1534                 xfs_log_force(mp, 0);
1535
1536         /*
1537          * inode clustering:
1538          * see if other inodes can be gathered into this write
1539          */
1540         error = xfs_iflush_cluster(ip, bp);
1541         if (error)
1542                 goto cluster_corrupt_out;
1543
1544         *bpp = bp;
1545         return 0;
1546
1547 corrupt_out:
1548         xfs_buf_relse(bp);
1549         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1550 cluster_corrupt_out:
1551         error = XFS_ERROR(EFSCORRUPTED);
1552 abort_out:
1553         /*
1554          * Unlocks the flush lock
1555          */
1556         xfs_iflush_abort(ip, false);
1557         return error;
1558 }
1559
1560
1561 STATIC int
1562 xfs_iflush_int(
1563         struct xfs_inode        *ip,
1564         struct xfs_buf          *bp)
1565 {
1566         struct xfs_inode_log_item *iip = ip->i_itemp;
1567         struct xfs_dinode       *dip;
1568         struct xfs_mount        *mp = ip->i_mount;
1569
1570         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
1571         ASSERT(xfs_isiflocked(ip));
1572         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
1573                ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
1574         ASSERT(iip != NULL && iip->ili_fields != 0);
1575
1576         /* set *dip = inode's place in the buffer */
1577         dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
1578
1579         if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
1580                                mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
1581                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
1582                         "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
1583                         __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
1584                 goto corrupt_out;
1585         }
1586         if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
1587                                 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
1588                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
1589                         "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
1590                         __func__, ip->i_ino, ip, ip->i_d.di_magic);
1591                 goto corrupt_out;
1592         }
1593         if (S_ISREG(ip->i_d.di_mode)) {
1594                 if (XFS_TEST_ERROR(
1595                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
1596                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
1597                     mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
1598                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
1599                                 "%s: Bad regular inode %Lu, ptr 0x%p",
1600                                 __func__, ip->i_ino, ip);
1601                         goto corrupt_out;
1602                 }
1603         } else if (S_ISDIR(ip->i_d.di_mode)) {
1604                 if (XFS_TEST_ERROR(
1605                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
1606                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
1607                     (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
1608                     mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
1609                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
1610                                 "%s: Bad directory inode %Lu, ptr 0x%p",
1611                                 __func__, ip->i_ino, ip);
1612                         goto corrupt_out;
1613                 }
1614         }
1615         if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
1616                                 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
1617                                 XFS_RANDOM_IFLUSH_5)) {
1618                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
1619                         "%s: detected corrupt incore inode %Lu, "
1620                         "total extents = %d, nblocks = %Ld, ptr 0x%p",
1621                         __func__, ip->i_ino,
1622                         ip->i_d.di_nextents + ip->i_d.di_anextents,
1623                         ip->i_d.di_nblocks, ip);
1624                 goto corrupt_out;
1625         }
1626         if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
1627                                 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
1628                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
1629                         "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
1630                         __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
1631                 goto corrupt_out;
1632         }
1633
1634         /*
1635          * Inode item log recovery for v1/v2 inodes are dependent on the
1636          * di_flushiter count for correct sequencing. We bump the flush
1637          * iteration count so we can detect flushes which postdate a log record
1638          * during recovery. This is redundant as we now log every change and
1639          * hence this can't happen but we need to still do it to ensure
1640          * backwards compatibility with old kernels that predate logging all
1641          * inode changes.
1642          */
1643         if (ip->i_d.di_version < 3)
1644                 ip->i_d.di_flushiter++;
1645
1646         /*
1647          * Copy the dirty parts of the inode into the on-disk
1648          * inode.  We always copy out the core of the inode,
1649          * because if the inode is dirty at all the core must
1650          * be.
1651          */
1652         xfs_dinode_to_disk(dip, &ip->i_d);
1653
1654         /* Wrap, we never let the log put out DI_MAX_FLUSH */
1655         if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
1656                 ip->i_d.di_flushiter = 0;
1657
1658         /*
1659          * If this is really an old format inode and the superblock version
1660          * has not been updated to support only new format inodes, then
1661          * convert back to the old inode format.  If the superblock version
1662          * has been updated, then make the conversion permanent.
1663          */
1664         ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
1665         if (ip->i_d.di_version == 1) {
1666                 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
1667                         /*
1668                          * Convert it back.
1669                          */
1670                         ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
1671                         dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
1672                 } else {
1673                         /*
1674                          * The superblock version has already been bumped,
1675                          * so just make the conversion to the new inode
1676                          * format permanent.
1677                          */
1678                         ip->i_d.di_version = 2;
1679                         dip->di_version = 2;
1680                         ip->i_d.di_onlink = 0;
1681                         dip->di_onlink = 0;
1682                         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1683                         memset(&(dip->di_pad[0]), 0,
1684                               sizeof(dip->di_pad));
1685                         ASSERT(xfs_get_projid(ip) == 0);
1686                 }
1687         }
1688
1689         xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
1690         if (XFS_IFORK_Q(ip))
1691                 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
1692         xfs_inobp_check(mp, bp);
1693
1694         /*
1695          * We've recorded everything logged in the inode, so we'd like to clear
1696          * the ili_fields bits so we don't log and flush things unnecessarily.
1697          * However, we can't stop logging all this information until the data
1698          * we've copied into the disk buffer is written to disk.  If we did we
1699          * might overwrite the copy of the inode in the log with all the data
1700          * after re-logging only part of it, and in the face of a crash we
1701          * wouldn't have all the data we need to recover.
1702          *
1703          * What we do is move the bits to the ili_last_fields field.  When
1704          * logging the inode, these bits are moved back to the ili_fields field.
1705          * In the xfs_iflush_done() routine we clear ili_last_fields, since we
1706          * know that the information those bits represent is permanently on
1707          * disk.  As long as the flush completes before the inode is logged
1708          * again, then both ili_fields and ili_last_fields will be cleared.
1709          *
1710          * We can play with the ili_fields bits here, because the inode lock
1711          * must be held exclusively in order to set bits there and the flush
1712          * lock protects the ili_last_fields bits.  Set ili_logged so the flush
1713          * done routine can tell whether or not to look in the AIL.  Also, store
1714          * the current LSN of the inode so that we can tell whether the item has
1715          * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
1716          * need the AIL lock, because it is a 64 bit value that cannot be read
1717          * atomically.
1718          */
1719         iip->ili_last_fields = iip->ili_fields;
1720         iip->ili_fields = 0;
1721         iip->ili_logged = 1;
1722
1723         xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
1724                                 &iip->ili_item.li_lsn);
1725
1726         /*
1727          * Attach the function xfs_iflush_done to the inode's
1728          * buffer.  This will remove the inode from the AIL
1729          * and unlock the inode's flush lock when the inode is
1730          * completely written to disk.
1731          */
1732         xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
1733
1734         /* update the lsn in the on disk inode if required */
1735         if (ip->i_d.di_version == 3)
1736                 dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
1737
1738         /* generate the checksum. */
1739         xfs_dinode_calc_crc(mp, dip);
1740
1741         ASSERT(bp->b_fspriv != NULL);
1742         ASSERT(bp->b_iodone != NULL);
1743         return 0;
1744
1745 corrupt_out:
1746         return XFS_ERROR(EFSCORRUPTED);
1747 }
1748
1749 /*
1750  * Test whether it is appropriate to check an inode for and free post EOF
1751  * blocks. The 'force' parameter determines whether we should also consider
1752  * regular files that are marked preallocated or append-only.
1753  */
1754 bool
1755 xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
1756 {
1757         /* prealloc/delalloc exists only on regular files */
1758         if (!S_ISREG(ip->i_d.di_mode))
1759                 return false;
1760
1761         /*
1762          * Zero sized files with no cached pages and delalloc blocks will not
1763          * have speculative prealloc/delalloc blocks to remove.
1764          */
1765         if (VFS_I(ip)->i_size == 0 &&
1766             VN_CACHED(VFS_I(ip)) == 0 &&
1767             ip->i_delayed_blks == 0)
1768                 return false;
1769
1770         /* If we haven't read in the extent list, then don't do it now. */
1771         if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
1772                 return false;
1773
1774         /*
1775          * Do not free real preallocated or append-only files unless the file
1776          * has delalloc blocks and we are forced to remove them.
1777          */
1778         if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
1779                 if (!force || ip->i_delayed_blks == 0)
1780                         return false;
1781
1782         return true;
1783 }