fs/btrfs/tree-log.c

   1 /*
   2  * Copyright (C) 2008 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18
  19 #include <linux/sched.h>
  20 #include <linux/slab.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/list_sort.h>
  23 #include "ctree.h"
  24 #include "transaction.h"
  25 #include "disk-io.h"
  26 #include "locking.h"
  27 #include "print-tree.h"
  28 #include "backref.h"
  29 #include "compat.h"
  30 #include "tree-log.h"
  31 #include "hash.h"
  32
   33 /* magic values for the inode_only argument of btrfs_log_inode():
  34  *
  35  * LOG_INODE_ALL means to log everything
  36  * LOG_INODE_EXISTS means to log just enough to recreate the inode
  37  * during log replay
  38  */
  39 #define LOG_INODE_ALL 0
  40 #define LOG_INODE_EXISTS 1
  41
  42 /*
  43  * directory trouble cases
  44  *
  45  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  46  * log, we must force a full commit before doing an fsync of the directory
  47  * where the unlink was done.
  48  * ---> record transid of last unlink/rename per directory
  49  *
  50  * mkdir foo/some_dir
  51  * normal commit
  52  * rename foo/some_dir foo2/some_dir
  53  * mkdir foo/some_dir
  54  * fsync foo/some_dir/some_file
  55  *
  56  * The fsync above will unlink the original some_dir without recording
  57  * it in its new location (foo2).  After a crash, some_dir will be gone
  58  * unless the fsync of some_file forces a full commit
  59  *
  60  * 2) we must log any new names for any file or dir that is in the fsync
  61  * log. ---> check inode while renaming/linking.
  62  *
  63  * 2a) we must log any new names for any file or dir during rename
  64  * when the directory they are being removed from was logged.
  65  * ---> check inode and old parent dir during rename
  66  *
   67  *  2a is actually the more important variant.  Without the extra logging
   68  *  a crash might unlink the old name without recreating the new one
  69  *
  70  * 3) after a crash, we must go through any directories with a link count
  71  * of zero and redo the rm -rf
  72  *
  73  * mkdir f1/foo
  74  * normal commit
  75  * rm -rf f1/foo
  76  * fsync(f1)
  77  *
   78  * The directory f1/foo was fully removed from the FS, but fsync was never
   79  * called on f1/foo, only its parent dir (f1).  After a crash the rm -rf
   80  * must be replayed.  This must be able to recurse down the entire
   81  * directory tree.  The inode link count fixup code takes care of the
   82  * ugly details.
  83  */
  84
  85 /*
   86  * stages for the tree walking.  The first
   87  * stage (0) is to only pin down the blocks we find.
   88  * The second stage (1) is to make sure that all the inodes
   89  * we find in the log are created in the subvolume.
   90  *
   91  * The last stage (2) is to deal with directories and links and extents
   92  * and all the other fun semantics
  93  */
  94 #define LOG_WALK_PIN_ONLY 0
  95 #define LOG_WALK_REPLAY_INODES 1
  96 #define LOG_WALK_REPLAY_ALL 2
  97
  98 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
  99                              struct btrfs_root *root, struct inode *inode,
 100                              int inode_only);
 101 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 102                              struct btrfs_root *root,
 103                              struct btrfs_path *path, u64 objectid);
 104 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 105                                        struct btrfs_root *root,
 106                                        struct btrfs_root *log,
 107                                        struct btrfs_path *path,
 108                                        u64 dirid, int del_all);
 109
 110 /*
 111  * tree logging is a special write ahead log used to make sure that
 112  * fsyncs and O_SYNCs can happen without doing full tree commits.
 113  *
  114  * Full tree commits are expensive because they require commonly
  115  * modified blocks to be recowed, creating many dirty pages in the
  116  * extent tree and a 4x-6x higher write load than ext3.
 117  *
 118  * Instead of doing a tree commit on every fsync, we use the
 119  * key ranges and transaction ids to find items for a given file or directory
 120  * that have changed in this transaction.  Those items are copied into
 121  * a special tree (one per subvolume root), that tree is written to disk
 122  * and then the fsync is considered complete.
 123  *
 124  * After a crash, items are copied out of the log-tree back into the
 125  * subvolume tree.  Any file data extents found are recorded in the extent
 126  * allocation tree, and the log-tree freed.
 127  *
  128  * The log tree is read three times: once to pin down all the extents it
  129  * is using in ram, once to create all the inodes logged in the tree,
  130  * and once to do all the other items.
 131  */
 132
 133 /*
 134  * start a sub transaction and setup the log tree
 135  * this increments the log tree writer count to make the people
 136  * syncing the tree wait for us to finish
 137  */
 138 static int start_log_trans(struct btrfs_trans_handle *trans,
 139                            struct btrfs_root *root)
 140 {
 141         int ret;
 142         int err = 0;
 143
 144         mutex_lock(&root->log_mutex);
 145         if (root->log_root) {
 146                 if (!root->log_start_pid) {
 147                         root->log_start_pid = current->pid;
 148                         root->log_multiple_pids = false;
 149                 } else if (root->log_start_pid != current->pid) {
 150                         root->log_multiple_pids = true;
 151                 }
 152
 153                 atomic_inc(&root->log_batch);
 154                 atomic_inc(&root->log_writers);
 155                 mutex_unlock(&root->log_mutex);
 156                 return 0;
 157         }
 158         root->log_multiple_pids = false;
 159         root->log_start_pid = current->pid;
 160         mutex_lock(&root->fs_info->tree_log_mutex);
 161         if (!root->fs_info->log_root_tree) {
 162                 ret = btrfs_init_log_root_tree(trans, root->fs_info);
 163                 if (ret)
 164                         err = ret;
 165         }
 166         if (err == 0 && !root->log_root) {
 167                 ret = btrfs_add_log_tree(trans, root);
 168                 if (ret)
 169                         err = ret;
 170         }
 171         mutex_unlock(&root->fs_info->tree_log_mutex);
 172         atomic_inc(&root->log_batch);
 173         atomic_inc(&root->log_writers);
 174         mutex_unlock(&root->log_mutex);
 175         return err;
 176 }
 177
 178 /*
 179  * returns 0 if there was a log transaction running and we were able
 180  * to join, or returns -ENOENT if there were not transactions
 181  * in progress
 182  */
 183 static int join_running_log_trans(struct btrfs_root *root)
 184 {
 185         int ret = -ENOENT;
 186
 187         smp_mb();
 188         if (!root->log_root)
 189                 return -ENOENT;
 190
 191         mutex_lock(&root->log_mutex);
 192         if (root->log_root) {
 193                 ret = 0;
 194                 atomic_inc(&root->log_writers);
 195         }
 196         mutex_unlock(&root->log_mutex);
 197         return ret;
 198 }
 199
 200 /*
 201  * This either makes the current running log transaction wait
 202  * until you call btrfs_end_log_trans() or it makes any future
 203  * log transactions wait until you call btrfs_end_log_trans()
 204  */
 205 int btrfs_pin_log_trans(struct btrfs_root *root)
 206 {
 207         int ret = -ENOENT;
 208
 209         mutex_lock(&root->log_mutex);
 210         atomic_inc(&root->log_writers);
 211         mutex_unlock(&root->log_mutex);
 212         return ret;
 213 }
 214
 215 /*
 216  * indicate we're done making changes to the log tree
 217  * and wake up anyone waiting to do a sync
 218  */
 219 void btrfs_end_log_trans(struct btrfs_root *root)
 220 {
 221         if (atomic_dec_and_test(&root->log_writers)) {
 222                 smp_mb();
 223                 if (waitqueue_active(&root->log_writer_wait))
 224                         wake_up(&root->log_writer_wait);
 225         }
 226 }
 227
 228
 229 /*
 230  * the walk control struct is used to pass state down the chain when
 231  * processing the log tree.  The stage field tells us which part
 232  * of the log tree processing we are currently doing.  The others
 233  * are state fields used for that specific part
 234  */
 235 struct walk_control {
 236         /* should we free the extent on disk when done?  This is used
 237          * at transaction commit time while freeing a log tree
 238          */
 239         int free;
 240
 241         /* should we write out the extent buffer?  This is used
 242          * while flushing the log tree to disk during a sync
 243          */
 244         int write;
 245
 246         /* should we wait for the extent buffer io to finish?  Also used
 247          * while flushing the log tree to disk for a sync
 248          */
 249         int wait;
 250
 251         /* pin only walk, we record which extents on disk belong to the
 252          * log trees
 253          */
 254         int pin;
 255
 256         /* what stage of the replay code we're currently in */
 257         int stage;
 258
 259         /* the root we are currently replaying */
 260         struct btrfs_root *replay_dest;
 261
 262         /* the trans handle for the current replay */
 263         struct btrfs_trans_handle *trans;
 264
 265         /* the function that gets used to process blocks we find in the
 266          * tree.  Note the extent_buffer might not be up to date when it is
 267          * passed in, and it must be checked or read if you need the data
 268          * inside it
 269          */
 270         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 271                             struct walk_control *wc, u64 gen);
 272 };
 273
 274 /*
 275  * process_func used to pin down extents, write them or wait on them
 276  */
 277 static int process_one_buffer(struct btrfs_root *log,
 278                               struct extent_buffer *eb,
 279                               struct walk_control *wc, u64 gen)
 280 {
 281         int ret = 0;
 282
 283         /*
 284          * If this fs is mixed then we need to be able to process the leaves to
 285          * pin down any logged extents, so we have to read the block.
 286          */
 287         if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
 288                 ret = btrfs_read_buffer(eb, gen);
 289                 if (ret)
 290                         return ret;
 291         }
 292
 293         if (wc->pin)
 294                 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
 295                                                       eb->start, eb->len);
 296
 297         if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 298                 if (wc->pin && btrfs_header_level(eb) == 0)
 299                         ret = btrfs_exclude_logged_extents(log, eb);
 300                 if (wc->write)
 301                         btrfs_write_tree_block(eb);
 302                 if (wc->wait)
 303                         btrfs_wait_tree_block_writeback(eb);
 304         }
 305         return ret;
 306 }
 307
 308 /*
 309  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 310  * to the src data we are copying out.
 311  *
 312  * root is the tree we are copying into, and path is a scratch
 313  * path for use in this function (it should be released on entry and
 314  * will be released on exit).
 315  *
 316  * If the key is already in the destination tree the existing item is
 317  * overwritten.  If the existing item isn't big enough, it is extended.
 318  * If it is too large, it is truncated.
 319  *
 320  * If the key isn't in the destination yet, a new item is inserted.
 321  */
 322 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 323                                    struct btrfs_root *root,
 324                                    struct btrfs_path *path,
 325                                    struct extent_buffer *eb, int slot,
 326                                    struct btrfs_key *key)
 327 {
 328         int ret;
 329         u32 item_size;
 330         u64 saved_i_size = 0;
 331         int save_old_i_size = 0;
 332         unsigned long src_ptr;
 333         unsigned long dst_ptr;
 334         int overwrite_root = 0;
 335         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 336
 337         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 338                 overwrite_root = 1;
 339
 340         item_size = btrfs_item_size_nr(eb, slot);
 341         src_ptr = btrfs_item_ptr_offset(eb, slot);
 342
 343         /* look for the key in the destination tree */
 344         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 345         if (ret < 0)
 346                 return ret;
 347
 348         if (ret == 0) {
 349                 char *src_copy;
 350                 char *dst_copy;
 351                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 352                                                   path->slots[0]);
 353                 if (dst_size != item_size)
 354                         goto insert;
 355
 356                 if (item_size == 0) {
 357                         btrfs_release_path(path);
 358                         return 0;
 359                 }
 360                 dst_copy = kmalloc(item_size, GFP_NOFS);
 361                 src_copy = kmalloc(item_size, GFP_NOFS);
 362                 if (!dst_copy || !src_copy) {
 363                         btrfs_release_path(path);
 364                         kfree(dst_copy);
 365                         kfree(src_copy);
 366                         return -ENOMEM;
 367                 }
 368
 369                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
 370
 371                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 372                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 373                                    item_size);
 374                 ret = memcmp(dst_copy, src_copy, item_size);
 375
 376                 kfree(dst_copy);
 377                 kfree(src_copy);
 378                 /*
 379                  * they have the same contents, just return, this saves
 380                  * us from cowing blocks in the destination tree and doing
 381                  * extra writes that may not have been done by a previous
 382                  * sync
 383                  */
 384                 if (ret == 0) {
 385                         btrfs_release_path(path);
 386                         return 0;
 387                 }
 388
 389                 /*
 390                  * We need to load the old nbytes into the inode so when we
 391                  * replay the extents we've logged we get the right nbytes.
 392                  */
 393                 if (inode_item) {
 394                         struct btrfs_inode_item *item;
 395                         u64 nbytes;
 396
 397                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 398                                               struct btrfs_inode_item);
 399                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 400                         item = btrfs_item_ptr(eb, slot,
 401                                               struct btrfs_inode_item);
 402                         btrfs_set_inode_nbytes(eb, item, nbytes);
 403                 }
 404         } else if (inode_item) {
 405                 struct btrfs_inode_item *item;
 406
 407                 /*
 408                  * New inode, set nbytes to 0 so that the nbytes comes out
 409                  * properly when we replay the extents.
 410                  */
 411                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 412                 btrfs_set_inode_nbytes(eb, item, 0);
 413         }
 414 insert:
 415         btrfs_release_path(path);
 416         /* try to insert the key into the destination tree */
 417         ret = btrfs_insert_empty_item(trans, root, path,
 418                                       key, item_size);
 419
 420         /* make sure any existing item is the correct size */
 421         if (ret == -EEXIST) {
 422                 u32 found_size;
 423                 found_size = btrfs_item_size_nr(path->nodes[0],
 424                                                 path->slots[0]);
 425                 if (found_size > item_size)
 426                         btrfs_truncate_item(root, path, item_size, 1);
 427                 else if (found_size < item_size)
 428                         btrfs_extend_item(root, path,
 429                                           item_size - found_size);
 430         } else if (ret) {
 431                 return ret;
 432         }
 433         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 434                                         path->slots[0]);
 435
 436         /* don't overwrite an existing inode if the generation number
 437          * was logged as zero.  This is done when the tree logging code
 438          * is just logging an inode to make sure it exists after recovery.
 439          *
 440          * Also, don't overwrite i_size on directories during replay.
 441          * log replay inserts and removes directory items based on the
 442          * state of the tree found in the subvolume, and i_size is modified
 443          * as it goes
 444          */
 445         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 446                 struct btrfs_inode_item *src_item;
 447                 struct btrfs_inode_item *dst_item;
 448
 449                 src_item = (struct btrfs_inode_item *)src_ptr;
 450                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 451
 452                 if (btrfs_inode_generation(eb, src_item) == 0)
 453                         goto no_copy;
 454
 455                 if (overwrite_root &&
 456                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 457                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 458                         save_old_i_size = 1;
 459                         saved_i_size = btrfs_inode_size(path->nodes[0],
 460                                                         dst_item);
 461                 }
 462         }
 463
 464         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 465                            src_ptr, item_size);
 466
 467         if (save_old_i_size) {
 468                 struct btrfs_inode_item *dst_item;
 469                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 470                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 471         }
 472
 473         /* make sure the generation is filled in */
 474         if (key->type == BTRFS_INODE_ITEM_KEY) {
 475                 struct btrfs_inode_item *dst_item;
 476                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 477                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 478                         btrfs_set_inode_generation(path->nodes[0], dst_item,
 479                                                    trans->transid);
 480                 }
 481         }
 482 no_copy:
 483         btrfs_mark_buffer_dirty(path->nodes[0]);
 484         btrfs_release_path(path);
 485         return 0;
 486 }
 487
 488 /*
 489  * simple helper to read an inode off the disk from a given root
 490  * This can only be called for subvolume roots and not for the log
 491  */
 492 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 493                                              u64 objectid)
 494 {
 495         struct btrfs_key key;
 496         struct inode *inode;
 497
 498         key.objectid = objectid;
 499         key.type = BTRFS_INODE_ITEM_KEY;
 500         key.offset = 0;
 501         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 502         if (IS_ERR(inode)) {
 503                 inode = NULL;
 504         } else if (is_bad_inode(inode)) {
 505                 iput(inode);
 506                 inode = NULL;
 507         }
 508         return inode;
 509 }
 510
 511 /* replays a single extent in 'eb' at 'slot' with 'key' into the
 512  * subvolume 'root'.  path is released on entry and should be released
 513  * on exit.
 514  *
 515  * extents in the log tree have not been allocated out of the extent
 516  * tree yet.  So, this completes the allocation, taking a reference
 517  * as required if the extent already exists or creating a new extent
 518  * if it isn't in the extent allocation tree yet.
 519  *
 520  * The extent is inserted into the file, dropping any existing extents
 521  * from the file that overlap the new one.
 522  */
 523 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 524                                       struct btrfs_root *root,
 525                                       struct btrfs_path *path,
 526                                       struct extent_buffer *eb, int slot,
 527                                       struct btrfs_key *key)
 528 {
 529         int found_type;
 530         u64 extent_end;
 531         u64 start = key->offset;
 532         u64 nbytes = 0;
 533         struct btrfs_file_extent_item *item;
 534         struct inode *inode = NULL;
 535         unsigned long size;
 536         int ret = 0;
 537
 538         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 539         found_type = btrfs_file_extent_type(eb, item);
 540
 541         if (found_type == BTRFS_FILE_EXTENT_REG ||
 542             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 543                 nbytes = btrfs_file_extent_num_bytes(eb, item);
 544                 extent_end = start + nbytes;
 545
 546                 /*
 547                  * We don't add to the inodes nbytes if we are prealloc or a
 548                  * hole.
 549                  */
 550                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 551                         nbytes = 0;
 552         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 553                 size = btrfs_file_extent_inline_len(eb, item);
 554                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
 555                 extent_end = ALIGN(start + size, root->sectorsize);
 556         } else {
 557                 ret = 0;
 558                 goto out;
 559         }
 560
 561         inode = read_one_inode(root, key->objectid);
 562         if (!inode) {
 563                 ret = -EIO;
 564                 goto out;
 565         }
 566
 567         /*
 568          * first check to see if we already have this extent in the
 569          * file.  This must be done before the btrfs_drop_extents run
 570          * so we don't try to drop this extent.
 571          */
 572         ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
 573                                        start, 0);
 574
 575         if (ret == 0 &&
 576             (found_type == BTRFS_FILE_EXTENT_REG ||
 577              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 578                 struct btrfs_file_extent_item cmp1;
 579                 struct btrfs_file_extent_item cmp2;
 580                 struct btrfs_file_extent_item *existing;
 581                 struct extent_buffer *leaf;
 582
 583                 leaf = path->nodes[0];
 584                 existing = btrfs_item_ptr(leaf, path->slots[0],
 585                                           struct btrfs_file_extent_item);
 586
 587                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
 588                                    sizeof(cmp1));
 589                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 590                                    sizeof(cmp2));
 591
 592                 /*
 593                  * we already have a pointer to this exact extent,
 594                  * we don't have to do anything
 595                  */
 596                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 597                         btrfs_release_path(path);
 598                         goto out;
 599                 }
 600         }
 601         btrfs_release_path(path);
 602
 603         /* drop any overlapping extents */
 604         ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 605         if (ret)
 606                 goto out;
 607
 608         if (found_type == BTRFS_FILE_EXTENT_REG ||
 609             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 610                 u64 offset;
 611                 unsigned long dest_offset;
 612                 struct btrfs_key ins;
 613
 614                 ret = btrfs_insert_empty_item(trans, root, path, key,
 615                                               sizeof(*item));
 616                 if (ret)
 617                         goto out;
 618                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 619                                                     path->slots[0]);
 620                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
 621                                 (unsigned long)item,  sizeof(*item));
 622
 623                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 624                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 625                 ins.type = BTRFS_EXTENT_ITEM_KEY;
 626                 offset = key->offset - btrfs_file_extent_offset(eb, item);
 627
 628                 if (ins.objectid > 0) {
 629                         u64 csum_start;
 630                         u64 csum_end;
 631                         LIST_HEAD(ordered_sums);
 632                         /*
 633                          * is this extent already allocated in the extent
 634                          * allocation tree?  If so, just add a reference
 635                          */
 636                         ret = btrfs_lookup_extent(root, ins.objectid,
 637                                                 ins.offset);
 638                         if (ret == 0) {
 639                                 ret = btrfs_inc_extent_ref(trans, root,
 640                                                 ins.objectid, ins.offset,
 641                                                 0, root->root_key.objectid,
 642                                                 key->objectid, offset, 0);
 643                                 if (ret)
 644                                         goto out;
 645                         } else {
 646                                 /*
 647                                  * insert the extent pointer in the extent
 648                                  * allocation tree
 649                                  */
 650                                 ret = btrfs_alloc_logged_file_extent(trans,
 651                                                 root, root->root_key.objectid,
 652                                                 key->objectid, offset, &ins);
 653                                 if (ret)
 654                                         goto out;
 655                         }
 656                         btrfs_release_path(path);
 657
 658                         if (btrfs_file_extent_compression(eb, item)) {
 659                                 csum_start = ins.objectid;
 660                                 csum_end = csum_start + ins.offset;
 661                         } else {
 662                                 csum_start = ins.objectid +
 663                                         btrfs_file_extent_offset(eb, item);
 664                                 csum_end = csum_start +
 665                                         btrfs_file_extent_num_bytes(eb, item);
 666                         }
 667
 668                         ret = btrfs_lookup_csums_range(root->log_root,
 669                                                 csum_start, csum_end - 1,
 670                                                 &ordered_sums, 0);
 671                         if (ret)
 672                                 goto out;
 673                         while (!list_empty(&ordered_sums)) {
 674                                 struct btrfs_ordered_sum *sums;
 675                                 sums = list_entry(ordered_sums.next,
 676                                                 struct btrfs_ordered_sum,
 677                                                 list);
 678                                 if (!ret)
 679                                         ret = btrfs_csum_file_blocks(trans,
 680                                                 root->fs_info->csum_root,
 681                                                 sums);
 682                                 list_del(&sums->list);
 683                                 kfree(sums);
 684                         }
 685                         if (ret)
 686                                 goto out;
 687                 } else {
 688                         btrfs_release_path(path);
 689                 }
 690         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 691                 /* inline extents are easy, we just overwrite them */
 692                 ret = overwrite_item(trans, root, path, eb, slot, key);
 693                 if (ret)
 694                         goto out;
 695         }
 696
 697         inode_add_bytes(inode, nbytes);
 698         ret = btrfs_update_inode(trans, root, inode);
 699 out:
 700         if (inode)
 701                 iput(inode);
 702         return ret;
 703 }
 704
 705 /*
 706  * when cleaning up conflicts between the directory names in the
 707  * subvolume, directory names in the log and directory names in the
 708  * inode back references, we may have to unlink inodes from directories.
 709  *
 710  * This is a helper function to do the unlink of a specific directory
 711  * item
 712  */
 713 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 714                                       struct btrfs_root *root,
 715                                       struct btrfs_path *path,
 716                                       struct inode *dir,
 717                                       struct btrfs_dir_item *di)
 718 {
 719         struct inode *inode;
 720         char *name;
 721         int name_len;
 722         struct extent_buffer *leaf;
 723         struct btrfs_key location;
 724         int ret;
 725
 726         leaf = path->nodes[0];
 727
 728         btrfs_dir_item_key_to_cpu(leaf, di, &location);
 729         name_len = btrfs_dir_name_len(leaf, di);
 730         name = kmalloc(name_len, GFP_NOFS);
 731         if (!name)
 732                 return -ENOMEM;
 733
 734         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 735         btrfs_release_path(path);
 736
 737         inode = read_one_inode(root, location.objectid);
 738         if (!inode) {
 739                 ret = -EIO;
 740                 goto out;
 741         }
 742
 743         ret = link_to_fixup_dir(trans, root, path, location.objectid);
 744         if (ret)
 745                 goto out;
 746
 747         ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 748         if (ret)
 749                 goto out;
 750         else
 751                 ret = btrfs_run_delayed_items(trans, root);
 752 out:
 753         kfree(name);
 754         iput(inode);
 755         return ret;
 756 }
 757
 758 /*
 759  * helper function to see if a given name and sequence number found
 760  * in an inode back reference are already in a directory and correctly
 761  * point to this inode
 762  */
 763 static noinline int inode_in_dir(struct btrfs_root *root,
 764                                  struct btrfs_path *path,
 765                                  u64 dirid, u64 objectid, u64 index,
 766                                  const char *name, int name_len)
 767 {
 768         struct btrfs_dir_item *di;
 769         struct btrfs_key location;
 770         int match = 0;
 771
 772         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 773                                          index, name, name_len, 0);
 774         if (di && !IS_ERR(di)) {
 775                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 776                 if (location.objectid != objectid)
 777                         goto out;
 778         } else
 779                 goto out;
 780         btrfs_release_path(path);
 781
 782         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 783         if (di && !IS_ERR(di)) {
 784                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 785                 if (location.objectid != objectid)
 786                         goto out;
 787         } else
 788                 goto out;
 789         match = 1;
 790 out:
 791         btrfs_release_path(path);
 792         return match;
 793 }
 794
 795 /*
 796  * helper function to check a log tree for a named back reference in
 797  * an inode.  This is used to decide if a back reference that is
 798  * found in the subvolume conflicts with what we find in the log.
 799  *
 800  * inode backreferences may have multiple refs in a single item,
 801  * during replay we process one reference at a time, and we don't
 802  * want to delete valid links to a file from the subvolume if that
 803  * link is also in the log.
 804  */
 805 static noinline int backref_in_log(struct btrfs_root *log,
 806                                    struct btrfs_key *key,
 807                                    u64 ref_objectid,
 808                                    char *name, int namelen)
 809 {
 810         struct btrfs_path *path;
 811         struct btrfs_inode_ref *ref;
 812         unsigned long ptr;
 813         unsigned long ptr_end;
 814         unsigned long name_ptr;
 815         int found_name_len;
 816         int item_size;
 817         int ret;
 818         int match = 0;
 819
 820         path = btrfs_alloc_path();
 821         if (!path)
 822                 return -ENOMEM;
 823
 824         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 825         if (ret != 0)
 826                 goto out;
 827
 828         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 829
 830         if (key->type == BTRFS_INODE_EXTREF_KEY) {
 831                 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
 832                                                    name, namelen, NULL))
 833                         match = 1;
 834
 835                 goto out;
 836         }
 837
 838         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 839         ptr_end = ptr + item_size;
 840         while (ptr < ptr_end) {
 841                 ref = (struct btrfs_inode_ref *)ptr;
 842                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
 843                 if (found_name_len == namelen) {
 844                         name_ptr = (unsigned long)(ref + 1);
 845                         ret = memcmp_extent_buffer(path->nodes[0], name,
 846                                                    name_ptr, namelen);
 847                         if (ret == 0) {
 848                                 match = 1;
 849                                 goto out;
 850                         }
 851                 }
 852                 ptr = (unsigned long)(ref + 1) + found_name_len;
 853         }
 854 out:
 855         btrfs_free_path(path);
 856         return match;
 857 }
 858
 859 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 860                                   struct btrfs_root *root,
 861                                   struct btrfs_path *path,
 862                                   struct btrfs_root *log_root,
 863                                   struct inode *dir, struct inode *inode,
 864                                   struct extent_buffer *eb,
 865                                   u64 inode_objectid, u64 parent_objectid,
 866                                   u64 ref_index, char *name, int namelen,
 867                                   int *search_done)
 868 {
 869         int ret;
 870         char *victim_name;
 871         int victim_name_len;
 872         struct extent_buffer *leaf;
 873         struct btrfs_dir_item *di;
 874         struct btrfs_key search_key;
 875         struct btrfs_inode_extref *extref;
 876
 877 again:
 878         /* Search old style refs */
 879         search_key.objectid = inode_objectid;
 880         search_key.type = BTRFS_INODE_REF_KEY;
 881         search_key.offset = parent_objectid;
 882         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 883         if (ret == 0) {
 884                 struct btrfs_inode_ref *victim_ref;
 885                 unsigned long ptr;
 886                 unsigned long ptr_end;
 887
 888                 leaf = path->nodes[0];
 889
 890                 /* are we trying to overwrite a back ref for the root directory
 891                  * if so, just jump out, we're done
 892                  */
 893                 if (search_key.objectid == search_key.offset)
 894                         return 1;
 895
 896                 /* check all the names in this back reference to see
 897                  * if they are in the log.  if so, we allow them to stay
 898                  * otherwise they must be unlinked as a conflict
 899                  */
 900                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 901                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
 902                 while (ptr < ptr_end) {
 903                         victim_ref = (struct btrfs_inode_ref *)ptr;
 904                         victim_name_len = btrfs_inode_ref_name_len(leaf,
 905                                                                    victim_ref);
 906                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
 907                         if (!victim_name)
 908                                 return -ENOMEM;
 909
 910                         read_extent_buffer(leaf, victim_name,
 911                                            (unsigned long)(victim_ref + 1),
 912                                            victim_name_len);
 913
 914                         if (!backref_in_log(log_root, &search_key,
 915                                             parent_objectid,
 916                                             victim_name,
 917                                             victim_name_len)) {
 918                                 btrfs_inc_nlink(inode);
 919                                 btrfs_release_path(path);
 920
 921                                 ret = btrfs_unlink_inode(trans, root, dir,
 922                                                          inode, victim_name,
 923                                                          victim_name_len);
 924                                 kfree(victim_name);
 925                                 if (ret)
 926                                         return ret;
 927                                 ret = btrfs_run_delayed_items(trans, root);
 928                                 if (ret)
 929                                         return ret;
 930                                 *search_done = 1;
 931                                 goto again;
 932                         }
 933                         kfree(victim_name);
 934
 935                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
 936                 }
 937
 938                 /*
 939                  * NOTE: we have searched root tree and checked the
 940                  * coresponding ref, it does not need to check again.
 941                  */
 942                 *search_done = 1;
 943         }
 944         btrfs_release_path(path);
 945
 946         /* Same search but for extended refs */
 947         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
 948                                            inode_objectid, parent_objectid, 0,
 949                                            0);
 950         if (!IS_ERR_OR_NULL(extref)) {
 951                 u32 item_size;
 952                 u32 cur_offset = 0;
 953                 unsigned long base;
 954                 struct inode *victim_parent;
 955
 956                 leaf = path->nodes[0];
 957
 958                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 959                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
 960
 961                 while (cur_offset < item_size) {
 962                         extref = (struct btrfs_inode_extref *)base + cur_offset;
 963
 964                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
 965
 966                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
 967                                 goto next;
 968
 969                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
 970                         if (!victim_name)
 971                                 return -ENOMEM;
 972                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
 973                                            victim_name_len);
 974
 975                         search_key.objectid = inode_objectid;
 976                         search_key.type = BTRFS_INODE_EXTREF_KEY;
 977                         search_key.offset = btrfs_extref_hash(parent_objectid,
 978                                                               victim_name,
 979                                                               victim_name_len);
 980                         ret = 0;
 981                         if (!backref_in_log(log_root, &search_key,
 982                                             parent_objectid, victim_name,
 983                                             victim_name_len)) {
 984                                 ret = -ENOENT;
 985                                 victim_parent = read_one_inode(root,
 986                                                                parent_objectid);
 987                                 if (victim_parent) {
 988                                         btrfs_inc_nlink(inode);
 989                                         btrfs_release_path(path);
 990
 991                                         ret = btrfs_unlink_inode(trans, root,
 992                                                                  victim_parent,
 993                                                                  inode,
 994                                                                  victim_name,
 995                                                                  victim_name_len);
 996                                         if (!ret)
 997                                                 ret = btrfs_run_delayed_items(
 998                                                                   trans, root);
 999                                 }
1000                                 iput(victim_parent);
1001                                 kfree(victim_name);
1002                                 if (ret)
1003                                         return ret;
1004                                 *search_done = 1;
1005                                 goto again;
1006                         }
1007                         kfree(victim_name);
1008                         if (ret)
1009                                 return ret;
1010 next:
1011                         cur_offset += victim_name_len + sizeof(*extref);
1012                 }
1013                 *search_done = 1;
1014         }
1015         btrfs_release_path(path);
1016
1017         /* look for a conflicting sequence number */
1018         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1019                                          ref_index, name, namelen, 0);
1020         if (di && !IS_ERR(di)) {
1021                 ret = drop_one_dir_item(trans, root, path, dir, di);
1022                 if (ret)
1023                         return ret;
1024         }
1025         btrfs_release_path(path);
1026
1027         /* look for a conflicing name */
1028         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1029                                    name, namelen, 0);
1030         if (di && !IS_ERR(di)) {
1031                 ret = drop_one_dir_item(trans, root, path, dir, di);
1032                 if (ret)
1033                         return ret;
1034         }
1035         btrfs_release_path(path);
1036
1037         return 0;
1038 }
1039
1040 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1041                              u32 *namelen, char **name, u64 *index,
1042                              u64 *parent_objectid)
1043 {
1044         struct btrfs_inode_extref *extref;
1045
1046         extref = (struct btrfs_inode_extref *)ref_ptr;
1047
1048         *namelen = btrfs_inode_extref_name_len(eb, extref);
1049         *name = kmalloc(*namelen, GFP_NOFS);
1050         if (*name == NULL)
1051                 return -ENOMEM;
1052
1053         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1054                            *namelen);
1055
1056         *index = btrfs_inode_extref_index(eb, extref);
1057         if (parent_objectid)
1058                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1059
1060         return 0;
1061 }
1062
1063 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1064                           u32 *namelen, char **name, u64 *index)
1065 {
1066         struct btrfs_inode_ref *ref;
1067
1068         ref = (struct btrfs_inode_ref *)ref_ptr;
1069
1070         *namelen = btrfs_inode_ref_name_len(eb, ref);
1071         *name = kmalloc(*namelen, GFP_NOFS);
1072         if (*name == NULL)
1073                 return -ENOMEM;
1074
1075         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1076
1077         *index = btrfs_inode_ref_index(eb, ref);
1078
1079         return 0;
1080 }
1081
1082 /*
1083  * replay one inode back reference item found in the log tree.
1084  * eb, slot and key refer to the buffer and key found in the log tree.
1085  * root is the destination we are replaying into, and path is for temp
1086  * use by this function.  (it should be released on return).
1087  */
1088 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1089                                   struct btrfs_root *root,
1090                                   struct btrfs_root *log,
1091                                   struct btrfs_path *path,
1092                                   struct extent_buffer *eb, int slot,
1093                                   struct btrfs_key *key)
1094 {
1095         struct inode *dir;
1096         struct inode *inode;
1097         unsigned long ref_ptr;
1098         unsigned long ref_end;
1099         char *name;
1100         int namelen;
1101         int ret;
1102         int search_done = 0;
1103         int log_ref_ver = 0;
1104         u64 parent_objectid;
1105         u64 inode_objectid;
1106         u64 ref_index = 0;
1107         int ref_struct_size;
1108
1109         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1110         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1111
1112         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1113                 struct btrfs_inode_extref *r;
1114
1115                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1116                 log_ref_ver = 1;
1117                 r = (struct btrfs_inode_extref *)ref_ptr;
1118                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1119         } else {
1120                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1121                 parent_objectid = key->offset;
1122         }
1123         inode_objectid = key->objectid;
1124
1125         /*
1126          * it is possible that we didn't log all the parent directories
1127          * for a given inode.  If we don't find the dir, just don't
1128          * copy the back ref in.  The link count fixup code will take
1129          * care of the rest
1130          */
1131         dir = read_one_inode(root, parent_objectid);
1132         if (!dir)
1133                 return -ENOENT;
1134
1135         inode = read_one_inode(root, inode_objectid);
1136         if (!inode) {
1137                 iput(dir);
1138                 return -EIO;
1139         }
1140
1141         while (ref_ptr < ref_end) {
1142                 if (log_ref_ver) {
1143                         ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1144                                                 &ref_index, &parent_objectid);
1145                         /*
1146                          * parent object can change from one array
1147                          * item to another.
1148                          */
1149                         if (!dir)
1150                                 dir = read_one_inode(root, parent_objectid);
1151                         if (!dir)
1152                                 return -ENOENT;
1153                 } else {
1154                         ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1155                                              &ref_index);
1156                 }
1157                 if (ret)
1158                         return ret;
1159
1160                 /* if we already have a perfect match, we're done */
1161                 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
1162                                   ref_index, name, namelen)) {
1163                         /*
1164                          * look for a conflicting back reference in the
1165                          * metadata. if we find one we have to unlink that name
1166                          * of the file before we add our new link.  Later on, we
1167                          * overwrite any existing back reference, and we don't
1168                          * want to create dangling pointers in the directory.
1169                          */
1170
1171                         if (!search_done) {
1172                                 ret = __add_inode_ref(trans, root, path, log,
1173                                                       dir, inode, eb,
1174                                                       inode_objectid,
1175                                                       parent_objectid,
1176                                                       ref_index, name, namelen,
1177                                                       &search_done);
1178                                 if (ret == 1) {
1179                                         ret = 0;
1180                                         goto out;
1181                                 }
1182                                 if (ret)
1183                                         goto out;
1184                         }
1185
1186                         /* insert our name */
1187                         ret = btrfs_add_link(trans, dir, inode, name, namelen,
1188                                              0, ref_index);
1189                         if (ret)
1190                                 goto out;
1191
1192                         btrfs_update_inode(trans, root, inode);
1193                 }
1194
1195                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1196                 kfree(name);
1197                 if (log_ref_ver) {
1198                         iput(dir);
1199                         dir = NULL;
1200                 }
1201         }
1202
1203         /* finally write the back reference in the inode */
1204         ret = overwrite_item(trans, root, path, eb, slot, key);
1205 out:
1206         btrfs_release_path(path);
1207         iput(dir);
1208         iput(inode);
1209         return ret;
1210 }
1211
1212 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1213                               struct btrfs_root *root, u64 offset)
1214 {
1215         int ret;
1216         ret = btrfs_find_orphan_item(root, offset);
1217         if (ret > 0)
1218                 ret = btrfs_insert_orphan_item(trans, root, offset);
1219         return ret;
1220 }
1221
1222 static int count_inode_extrefs(struct btrfs_root *root,
1223                                struct inode *inode, struct btrfs_path *path)
1224 {
1225         int ret = 0;
1226         int name_len;
1227         unsigned int nlink = 0;
1228         u32 item_size;
1229         u32 cur_offset = 0;
1230         u64 inode_objectid = btrfs_ino(inode);
1231         u64 offset = 0;
1232         unsigned long ptr;
1233         struct btrfs_inode_extref *extref;
1234         struct extent_buffer *leaf;
1235
1236         while (1) {
1237                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1238                                             &extref, &offset);
1239                 if (ret)
1240                         break;
1241
1242                 leaf = path->nodes[0];
1243                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1244                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1245
1246                 while (cur_offset < item_size) {
1247                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1248                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1249
1250                         nlink++;
1251
1252                         cur_offset += name_len + sizeof(*extref);
1253                 }
1254
1255                 offset++;
1256                 btrfs_release_path(path);
1257         }
1258         btrfs_release_path(path);
1259
1260         if (ret < 0)
1261                 return ret;
1262         return nlink;
1263 }
1264
1265 static int count_inode_refs(struct btrfs_root *root,
1266                                struct inode *inode, struct btrfs_path *path)
1267 {
1268         int ret;
1269         struct btrfs_key key;
1270         unsigned int nlink = 0;
1271         unsigned long ptr;
1272         unsigned long ptr_end;
1273         int name_len;
1274         u64 ino = btrfs_ino(inode);
1275
1276         key.objectid = ino;
1277         key.type = BTRFS_INODE_REF_KEY;
1278         key.offset = (u64)-1;
1279
1280         while (1) {
1281                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1282                 if (ret < 0)
1283                         break;
1284                 if (ret > 0) {
1285                         if (path->slots[0] == 0)
1286                                 break;
1287                         path->slots[0]--;
1288                 }
1289                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1290                                       path->slots[0]);
1291                 if (key.objectid != ino ||
1292                     key.type != BTRFS_INODE_REF_KEY)
1293                         break;
1294                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1295                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1296                                                    path->slots[0]);
1297                 while (ptr < ptr_end) {
1298                         struct btrfs_inode_ref *ref;
1299
1300                         ref = (struct btrfs_inode_ref *)ptr;
1301                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1302                                                             ref);
1303                         ptr = (unsigned long)(ref + 1) + name_len;
1304                         nlink++;
1305                 }
1306
1307                 if (key.offset == 0)
1308                         break;
1309                 key.offset--;
1310                 btrfs_release_path(path);
1311         }
1312         btrfs_release_path(path);
1313
1314         return nlink;
1315 }
1316
1317 /*
1318  * There are a few corners where the link count of the file can't
1319  * be properly maintained during replay.  So, instead of adding
1320  * lots of complexity to the log code, we just scan the backrefs
1321  * for any file that has been through replay.
1322  *
1323  * The scan will update the link count on the inode to reflect the
1324  * number of back refs found.  If it goes down to zero, the iput
1325  * will free the inode.
1326  */
1327 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1328                                            struct btrfs_root *root,
1329                                            struct inode *inode)
1330 {
1331         struct btrfs_path *path;
1332         int ret;
1333         u64 nlink = 0;
1334         u64 ino = btrfs_ino(inode);
1335
1336         path = btrfs_alloc_path();
1337         if (!path)
1338                 return -ENOMEM;
1339
1340         ret = count_inode_refs(root, inode, path);
1341         if (ret < 0)
1342                 goto out;
1343
1344         nlink = ret;
1345
1346         ret = count_inode_extrefs(root, inode, path);
1347         if (ret == -ENOENT)
1348                 ret = 0;
1349
1350         if (ret < 0)
1351                 goto out;
1352
1353         nlink += ret;
1354
1355         ret = 0;
1356
1357         if (nlink != inode->i_nlink) {
1358                 set_nlink(inode, nlink);
1359                 btrfs_update_inode(trans, root, inode);
1360         }
1361         BTRFS_I(inode)->index_cnt = (u64)-1;
1362
1363         if (inode->i_nlink == 0) {
1364                 if (S_ISDIR(inode->i_mode)) {
1365                         ret = replay_dir_deletes(trans, root, NULL, path,
1366                                                  ino, 1);
1367                         if (ret)
1368                                 goto out;
1369                 }
1370                 ret = insert_orphan_item(trans, root, ino);
1371         }
1372
1373 out:
1374         btrfs_free_path(path);
1375         return ret;
1376 }
1377
1378 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1379                                             struct btrfs_root *root,
1380                                             struct btrfs_path *path)
1381 {
1382         int ret;
1383         struct btrfs_key key;
1384         struct inode *inode;
1385
1386         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1387         key.type = BTRFS_ORPHAN_ITEM_KEY;
1388         key.offset = (u64)-1;
1389         while (1) {
1390                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1391                 if (ret < 0)
1392                         break;
1393
1394                 if (ret == 1) {
1395                         if (path->slots[0] == 0)
1396                                 break;
1397                         path->slots[0]--;
1398                 }
1399
1400                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1401                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1402                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1403                         break;
1404
1405                 ret = btrfs_del_item(trans, root, path);
1406                 if (ret)
1407                         goto out;
1408
1409                 btrfs_release_path(path);
1410                 inode = read_one_inode(root, key.offset);
1411                 if (!inode)
1412                         return -EIO;
1413
1414                 ret = fixup_inode_link_count(trans, root, inode);
1415                 iput(inode);
1416                 if (ret)
1417                         goto out;
1418
1419                 /*
1420                  * fixup on a directory may create new entries,
1421                  * make sure we always look for the highset possible
1422                  * offset
1423                  */
1424                 key.offset = (u64)-1;
1425         }
1426         ret = 0;
1427 out:
1428         btrfs_release_path(path);
1429         return ret;
1430 }
1431
1432
1433 /*
1434  * record a given inode in the fixup dir so we can check its link
1435  * count when replay is done.  The link count is incremented here
1436  * so the inode won't go away until we check it
1437  */
1438 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1439                                       struct btrfs_root *root,
1440                                       struct btrfs_path *path,
1441                                       u64 objectid)
1442 {
1443         struct btrfs_key key;
1444         int ret = 0;
1445         struct inode *inode;
1446
1447         inode = read_one_inode(root, objectid);
1448         if (!inode)
1449                 return -EIO;
1450
1451         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1452         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1453         key.offset = objectid;
1454
1455         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1456
1457         btrfs_release_path(path);
1458         if (ret == 0) {
1459                 if (!inode->i_nlink)
1460                         set_nlink(inode, 1);
1461                 else
1462                         btrfs_inc_nlink(inode);
1463                 ret = btrfs_update_inode(trans, root, inode);
1464         } else if (ret == -EEXIST) {
1465                 ret = 0;
1466         } else {
1467                 BUG(); /* Logic Error */
1468         }
1469         iput(inode);
1470
1471         return ret;
1472 }
1473
1474 /*
1475  * when replaying the log for a directory, we only insert names
1476  * for inodes that actually exist.  This means an fsync on a directory
1477  * does not implicitly fsync all the new files in it
1478  */
1479 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1480                                     struct btrfs_root *root,
1481                                     struct btrfs_path *path,
1482                                     u64 dirid, u64 index,
1483                                     char *name, int name_len, u8 type,
1484                                     struct btrfs_key *location)
1485 {
1486         struct inode *inode;
1487         struct inode *dir;
1488         int ret;
1489
1490         inode = read_one_inode(root, location->objectid);
1491         if (!inode)
1492                 return -ENOENT;
1493
1494         dir = read_one_inode(root, dirid);
1495         if (!dir) {
1496                 iput(inode);
1497                 return -EIO;
1498         }
1499         ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1500
1501         /* FIXME, put inode into FIXUP list */
1502
1503         iput(inode);
1504         iput(dir);
1505         return ret;
1506 }
1507
1508 /*
1509  * take a single entry in a log directory item and replay it into
1510  * the subvolume.
1511  *
1512  * if a conflicting item exists in the subdirectory already,
1513  * the inode it points to is unlinked and put into the link count
1514  * fix up tree.
1515  *
1516  * If a name from the log points to a file or directory that does
1517  * not exist in the FS, it is skipped.  fsyncs on directories
1518  * do not force down inodes inside that directory, just changes to the
1519  * names or unlinks in a directory.
1520  */
1521 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1522                                     struct btrfs_root *root,
1523                                     struct btrfs_path *path,
1524                                     struct extent_buffer *eb,
1525                                     struct btrfs_dir_item *di,
1526                                     struct btrfs_key *key)
1527 {
1528         char *name;
1529         int name_len;
1530         struct btrfs_dir_item *dst_di;
1531         struct btrfs_key found_key;
1532         struct btrfs_key log_key;
1533         struct inode *dir;
1534         u8 log_type;
1535         int exists;
1536         int ret = 0;
1537
1538         dir = read_one_inode(root, key->objectid);
1539         if (!dir)
1540                 return -EIO;
1541
1542         name_len = btrfs_dir_name_len(eb, di);
1543         name = kmalloc(name_len, GFP_NOFS);
1544         if (!name) {
1545                 ret = -ENOMEM;
1546                 goto out;
1547         }
1548
1549         log_type = btrfs_dir_type(eb, di);
1550         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1551                    name_len);
1552
1553         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1554         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1555         if (exists == 0)
1556                 exists = 1;
1557         else
1558                 exists = 0;
1559         btrfs_release_path(path);
1560
1561         if (key->type == BTRFS_DIR_ITEM_KEY) {
1562                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1563                                        name, name_len, 1);
1564         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1565                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1566                                                      key->objectid,
1567                                                      key->offset, name,
1568                                                      name_len, 1);
1569         } else {
1570                 /* Corruption */
1571                 ret = -EINVAL;
1572                 goto out;
1573         }
1574         if (IS_ERR_OR_NULL(dst_di)) {
1575                 /* we need a sequence number to insert, so we only
1576                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1577                  */
1578                 if (key->type != BTRFS_DIR_INDEX_KEY)
1579                         goto out;
1580                 goto insert;
1581         }
1582
1583         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1584         /* the existing item matches the logged item */
1585         if (found_key.objectid == log_key.objectid &&
1586             found_key.type == log_key.type &&
1587             found_key.offset == log_key.offset &&
1588             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1589                 goto out;
1590         }
1591
1592         /*
1593          * don't drop the conflicting directory entry if the inode
1594          * for the new entry doesn't exist
1595          */
1596         if (!exists)
1597                 goto out;
1598
1599         ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1600         if (ret)
1601                 goto out;
1602
1603         if (key->type == BTRFS_DIR_INDEX_KEY)
1604                 goto insert;
1605 out:
1606         btrfs_release_path(path);
1607         kfree(name);
1608         iput(dir);
1609         return ret;
1610
1611 insert:
1612         btrfs_release_path(path);
1613         ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1614                               name, name_len, log_type, &log_key);
1615         if (ret && ret != -ENOENT)
1616                 goto out;
1617         ret = 0;
1618         goto out;
1619 }
1620
1621 /*
1622  * find all the names in a directory item and reconcile them into
1623  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1624  * one name in a directory item, but the same code gets used for
1625  * both directory index types
1626  */
1627 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1628                                         struct btrfs_root *root,
1629                                         struct btrfs_path *path,
1630                                         struct extent_buffer *eb, int slot,
1631                                         struct btrfs_key *key)
1632 {
1633         int ret;
1634         u32 item_size = btrfs_item_size_nr(eb, slot);
1635         struct btrfs_dir_item *di;
1636         int name_len;
1637         unsigned long ptr;
1638         unsigned long ptr_end;
1639
1640         ptr = btrfs_item_ptr_offset(eb, slot);
1641         ptr_end = ptr + item_size;
1642         while (ptr < ptr_end) {
1643                 di = (struct btrfs_dir_item *)ptr;
1644                 if (verify_dir_item(root, eb, di))
1645                         return -EIO;
1646                 name_len = btrfs_dir_name_len(eb, di);
1647                 ret = replay_one_name(trans, root, path, eb, di, key);
1648                 if (ret)
1649                         return ret;
1650                 ptr = (unsigned long)(di + 1);
1651                 ptr += name_len;
1652         }
1653         return 0;
1654 }
1655
1656 /*
1657  * directory replay has two parts.  There are the standard directory
1658  * items in the log copied from the subvolume, and range items
1659  * created in the log while the subvolume was logged.
1660  *
1661  * The range items tell us which parts of the key space the log
1662  * is authoritative for.  During replay, if a key in the subvolume
1663  * directory is in a logged range item, but not actually in the log
1664  * that means it was deleted from the directory before the fsync
1665  * and should be removed.
1666  */
1667 static noinline int find_dir_range(struct btrfs_root *root,
1668                                    struct btrfs_path *path,
1669                                    u64 dirid, int key_type,
1670                                    u64 *start_ret, u64 *end_ret)
1671 {
1672         struct btrfs_key key;
1673         u64 found_end;
1674         struct btrfs_dir_log_item *item;
1675         int ret;
1676         int nritems;
1677
1678         if (*start_ret == (u64)-1)
1679                 return 1;
1680
1681         key.objectid = dirid;
1682         key.type = key_type;
1683         key.offset = *start_ret;
1684
1685         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1686         if (ret < 0)
1687                 goto out;
1688         if (ret > 0) {
1689                 if (path->slots[0] == 0)
1690                         goto out;
1691                 path->slots[0]--;
1692         }
1693         if (ret != 0)
1694                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1695
1696         if (key.type != key_type || key.objectid != dirid) {
1697                 ret = 1;
1698                 goto next;
1699         }
1700         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1701                               struct btrfs_dir_log_item);
1702         found_end = btrfs_dir_log_end(path->nodes[0], item);
1703
1704         if (*start_ret >= key.offset && *start_ret <= found_end) {
1705                 ret = 0;
1706                 *start_ret = key.offset;
1707                 *end_ret = found_end;
1708                 goto out;
1709         }
1710         ret = 1;
1711 next:
1712         /* check the next slot in the tree to see if it is a valid item */
1713         nritems = btrfs_header_nritems(path->nodes[0]);
1714         if (path->slots[0] >= nritems) {
1715                 ret = btrfs_next_leaf(root, path);
1716                 if (ret)
1717                         goto out;
1718         } else {
1719                 path->slots[0]++;
1720         }
1721
1722         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1723
1724         if (key.type != key_type || key.objectid != dirid) {
1725                 ret = 1;
1726                 goto out;
1727         }
1728         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1729                               struct btrfs_dir_log_item);
1730         found_end = btrfs_dir_log_end(path->nodes[0], item);
1731         *start_ret = key.offset;
1732         *end_ret = found_end;
1733         ret = 0;
1734 out:
1735         btrfs_release_path(path);
1736         return ret;
1737 }
1738
1739 /*
1740  * this looks for a given directory item in the log.  If the directory
1741  * item is not in the log, the item is removed and the inode it points
1742  * to is unlinked
1743  */
1744 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1745                                       struct btrfs_root *root,
1746                                       struct btrfs_root *log,
1747                                       struct btrfs_path *path,
1748                                       struct btrfs_path *log_path,
1749                                       struct inode *dir,
1750                                       struct btrfs_key *dir_key)
1751 {
1752         int ret;
1753         struct extent_buffer *eb;
1754         int slot;
1755         u32 item_size;
1756         struct btrfs_dir_item *di;
1757         struct btrfs_dir_item *log_di;
1758         int name_len;
1759         unsigned long ptr;
1760         unsigned long ptr_end;
1761         char *name;
1762         struct inode *inode;
1763         struct btrfs_key location;
1764
1765 again:
1766         eb = path->nodes[0];
1767         slot = path->slots[0];
1768         item_size = btrfs_item_size_nr(eb, slot);
1769         ptr = btrfs_item_ptr_offset(eb, slot);
1770         ptr_end = ptr + item_size;
1771         while (ptr < ptr_end) {
1772                 di = (struct btrfs_dir_item *)ptr;
1773                 if (verify_dir_item(root, eb, di)) {
1774                         ret = -EIO;
1775                         goto out;
1776                 }
1777
1778                 name_len = btrfs_dir_name_len(eb, di);
1779                 name = kmalloc(name_len, GFP_NOFS);
1780                 if (!name) {
1781                         ret = -ENOMEM;
1782                         goto out;
1783                 }
1784                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1785                                   name_len);
1786                 log_di = NULL;
1787                 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
1788                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
1789                                                        dir_key->objectid,
1790                                                        name, name_len, 0);
1791                 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
1792                         log_di = btrfs_lookup_dir_index_item(trans, log,
1793                                                      log_path,
1794                                                      dir_key->objectid,
1795                                                      dir_key->offset,
1796                                                      name, name_len, 0);
1797                 }
1798                 if (IS_ERR_OR_NULL(log_di)) {
1799                         btrfs_dir_item_key_to_cpu(eb, di, &location);
1800                         btrfs_release_path(path);
1801                         btrfs_release_path(log_path);
1802                         inode = read_one_inode(root, location.objectid);
1803                         if (!inode) {
1804                                 kfree(name);
1805                                 return -EIO;
1806                         }
1807
1808                         ret = link_to_fixup_dir(trans, root,
1809                                                 path, location.objectid);
1810                         if (ret) {
1811                                 kfree(name);
1812                                 iput(inode);
1813                                 goto out;
1814                         }
1815
1816                         btrfs_inc_nlink(inode);
1817                         ret = btrfs_unlink_inode(trans, root, dir, inode,
1818                                                  name, name_len);
1819                         if (!ret)
1820                                 ret = btrfs_run_delayed_items(trans, root);
1821                         kfree(name);
1822                         iput(inode);
1823                         if (ret)
1824                                 goto out;
1825
1826                         /* there might still be more names under this key
1827                          * check and repeat if required
1828                          */
1829                         ret = btrfs_search_slot(NULL, root, dir_key, path,
1830                                                 0, 0);
1831                         if (ret == 0)
1832                                 goto again;
1833                         ret = 0;
1834                         goto out;
1835                 }
1836                 btrfs_release_path(log_path);
1837                 kfree(name);
1838
1839                 ptr = (unsigned long)(di + 1);
1840                 ptr += name_len;
1841         }
1842         ret = 0;
1843 out:
1844         btrfs_release_path(path);
1845         btrfs_release_path(log_path);
1846         return ret;
1847 }
1848
1849 /*
1850  * deletion replay happens before we copy any new directory items
1851  * out of the log or out of backreferences from inodes.  It
1852  * scans the log to find ranges of keys that log is authoritative for,
1853  * and then scans the directory to find items in those ranges that are
1854  * not present in the log.
1855  *
1856  * Anything we don't find in the log is unlinked and removed from the
1857  * directory.
1858  */
1859 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1860                                        struct btrfs_root *root,
1861                                        struct btrfs_root *log,
1862                                        struct btrfs_path *path,
1863                                        u64 dirid, int del_all)
1864 {
1865         u64 range_start;
1866         u64 range_end;
1867         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1868         int ret = 0;
1869         struct btrfs_key dir_key;
1870         struct btrfs_key found_key;
1871         struct btrfs_path *log_path;
1872         struct inode *dir;
1873
1874         dir_key.objectid = dirid;
1875         dir_key.type = BTRFS_DIR_ITEM_KEY;
1876         log_path = btrfs_alloc_path();
1877         if (!log_path)
1878                 return -ENOMEM;
1879
1880         dir = read_one_inode(root, dirid);
1881         /* it isn't an error if the inode isn't there, that can happen
1882          * because we replay the deletes before we copy in the inode item
1883          * from the log
1884          */
1885         if (!dir) {
1886                 btrfs_free_path(log_path);
1887                 return 0;
1888         }
1889 again:
1890         range_start = 0;
1891         range_end = 0;
1892         while (1) {
1893                 if (del_all)
1894                         range_end = (u64)-1;
1895                 else {
1896                         ret = find_dir_range(log, path, dirid, key_type,
1897                                              &range_start, &range_end);
1898                         if (ret != 0)
1899                                 break;
1900                 }
1901
1902                 dir_key.offset = range_start;
1903                 while (1) {
1904                         int nritems;
1905                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
1906                                                 0, 0);
1907                         if (ret < 0)
1908                                 goto out;
1909
1910                         nritems = btrfs_header_nritems(path->nodes[0]);
1911                         if (path->slots[0] >= nritems) {
1912                                 ret = btrfs_next_leaf(root, path);
1913                                 if (ret)
1914                                         break;
1915                         }
1916                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1917                                               path->slots[0]);
1918                         if (found_key.objectid != dirid ||
1919                             found_key.type != dir_key.type)
1920                                 goto next_type;
1921
1922                         if (found_key.offset > range_end)
1923                                 break;
1924
1925                         ret = check_item_in_log(trans, root, log, path,
1926                                                 log_path, dir,
1927                                                 &found_key);
1928                         if (ret)
1929                                 goto out;
1930                         if (found_key.offset == (u64)-1)
1931                                 break;
1932                         dir_key.offset = found_key.offset + 1;
1933                 }
1934                 btrfs_release_path(path);
1935                 if (range_end == (u64)-1)
1936                         break;
1937                 range_start = range_end + 1;
1938         }
1939
1940 next_type:
1941         ret = 0;
1942         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1943                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1944                 dir_key.type = BTRFS_DIR_INDEX_KEY;
1945                 btrfs_release_path(path);
1946                 goto again;
1947         }
1948 out:
1949         btrfs_release_path(path);
1950         btrfs_free_path(log_path);
1951         iput(dir);
1952         return ret;
1953 }
1954
1955 /*
1956  * the process_func used to replay items from the log tree.  This
1957  * gets called in two different stages.  The first stage just looks
1958  * for inodes and makes sure they are all copied into the subvolume.
1959  *
1960  * The second stage copies all the other item types from the log into
1961  * the subvolume.  The two stage approach is slower, but gets rid of
1962  * lots of complexity around inodes referencing other inodes that exist
1963  * only in the log (references come from either directory items or inode
1964  * back refs).
1965  */
1966 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1967                              struct walk_control *wc, u64 gen)
1968 {
1969         int nritems;
1970         struct btrfs_path *path;
1971         struct btrfs_root *root = wc->replay_dest;
1972         struct btrfs_key key;
1973         int level;
1974         int i;
1975         int ret;
1976
1977         ret = btrfs_read_buffer(eb, gen);
1978         if (ret)
1979                 return ret;
1980
1981         level = btrfs_header_level(eb);
1982
1983         if (level != 0)
1984                 return 0;
1985
1986         path = btrfs_alloc_path();
1987         if (!path)
1988                 return -ENOMEM;
1989
1990         nritems = btrfs_header_nritems(eb);
1991         for (i = 0; i < nritems; i++) {
1992                 btrfs_item_key_to_cpu(eb, &key, i);
1993
1994                 /* inode keys are done during the first stage */
1995                 if (key.type == BTRFS_INODE_ITEM_KEY &&
1996                     wc->stage == LOG_WALK_REPLAY_INODES) {
1997                         struct btrfs_inode_item *inode_item;
1998                         u32 mode;
1999
2000                         inode_item = btrfs_item_ptr(eb, i,
2001                                             struct btrfs_inode_item);
2002                         mode = btrfs_inode_mode(eb, inode_item);
2003                         if (S_ISDIR(mode)) {
2004                                 ret = replay_dir_deletes(wc->trans,
2005                                          root, log, path, key.objectid, 0);
2006                                 if (ret)
2007                                         break;
2008                         }
2009                         ret = overwrite_item(wc->trans, root, path,
2010                                              eb, i, &key);
2011                         if (ret)
2012                                 break;
2013
2014                         /* for regular files, make sure corresponding
2015                          * orhpan item exist. extents past the new EOF
2016                          * will be truncated later by orphan cleanup.
2017                          */
2018                         if (S_ISREG(mode)) {
2019                                 ret = insert_orphan_item(wc->trans, root,
2020                                                          key.objectid);
2021                                 if (ret)
2022                                         break;
2023                         }
2024
2025                         ret = link_to_fixup_dir(wc->trans, root,
2026                                                 path, key.objectid);
2027                         if (ret)
2028                                 break;
2029                 }
2030                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2031                         continue;
2032
2033                 /* these keys are simply copied */
2034                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2035                         ret = overwrite_item(wc->trans, root, path,
2036                                              eb, i, &key);
2037                         if (ret)
2038                                 break;
2039                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2040                            key.type == BTRFS_INODE_EXTREF_KEY) {
2041                         ret = add_inode_ref(wc->trans, root, log, path,
2042                                             eb, i, &key);
2043                         if (ret && ret != -ENOENT)
2044                                 break;
2045                         ret = 0;
2046                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2047                         ret = replay_one_extent(wc->trans, root, path,
2048                                                 eb, i, &key);
2049                         if (ret)
2050                                 break;
2051                 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
2052                            key.type == BTRFS_DIR_INDEX_KEY) {
2053                         ret = replay_one_dir_item(wc->trans, root, path,
2054                                                   eb, i, &key);
2055                         if (ret)
2056                                 break;
2057                 }
2058         }
2059         btrfs_free_path(path);
2060         return ret;
2061 }
2062
/*
 * Descend the log tree from path->nodes[*level] towards the leaves.
 *
 * For nodes at level 1 every child leaf from the current slot onwards
 * is handed to wc->process_func; when wc->free is set the leaf is then
 * read, cleaned, waited on for writeback and its extent freed and
 * pinned.  For higher levels the child at the current slot is read and
 * installed in the path one level down, and the walk continues there.
 *
 * On a normal return path->slots[*level] is set to the item count of
 * the node at *level.
 *
 * Returns 0 on success, -ENOMEM if a tree block cannot be found or
 * created, or the error returned by wc->process_func,
 * btrfs_read_buffer() or btrfs_free_and_pin_reserved_extent().
 */
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path, int *level,
                                   struct walk_control *wc)
{
        u64 root_owner;
        u64 bytenr;
        u64 ptr_gen;
        struct extent_buffer *next;
        struct extent_buffer *cur;
        struct extent_buffer *parent;
        u32 blocksize;
        int ret = 0;

        WARN_ON(*level < 0);
        WARN_ON(*level >= BTRFS_MAX_LEVEL);

        while (*level > 0) {
                WARN_ON(*level < 0);
                WARN_ON(*level >= BTRFS_MAX_LEVEL);
                cur = path->nodes[*level];

                if (btrfs_header_level(cur) != *level)
                        WARN_ON(1);

                /* every child of this node has been visited */
                if (path->slots[*level] >=
                    btrfs_header_nritems(cur))
                        break;

                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
                blocksize = btrfs_level_size(root, *level - 1);

                parent = path->nodes[*level];
                root_owner = btrfs_header_owner(parent);

                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
                if (!next)
                        return -ENOMEM;

                if (*level == 1) {
                        /* the child is a leaf: process it, don't descend */
                        ret = wc->process_func(root, next, wc, ptr_gen);
                        if (ret) {
                                free_extent_buffer(next);
                                return ret;
                        }

                        /* move on to the next leaf of this node */
                        path->slots[*level]++;
                        if (wc->free) {
                                ret = btrfs_read_buffer(next, ptr_gen);
                                if (ret) {
                                        free_extent_buffer(next);
                                        return ret;
                                }

                                btrfs_tree_lock(next);
                                btrfs_set_lock_blocking(next);
                                clean_tree_block(trans, root, next);
                                btrfs_wait_tree_block_writeback(next);
                                btrfs_tree_unlock(next);

                                WARN_ON(root_owner !=
                                        BTRFS_TREE_LOG_OBJECTID);
                                ret = btrfs_free_and_pin_reserved_extent(root,
                                                         bytenr, blocksize);
                                if (ret) {
                                        free_extent_buffer(next);
                                        return ret;
                                }
                        }
                        free_extent_buffer(next);
                        continue;
                }
                ret = btrfs_read_buffer(next, ptr_gen);
                if (ret) {
                        free_extent_buffer(next);
                        return ret;
                }

                WARN_ON(*level <= 0);
                /* replace whatever buffer the path held one level down */
                if (path->nodes[*level-1])
                        free_extent_buffer(path->nodes[*level-1]);
                path->nodes[*level-1] = next;
                *level = btrfs_header_level(next);
                path->slots[*level] = 0;
                cond_resched();
        }
        WARN_ON(*level < 0);
        WARN_ON(*level >= BTRFS_MAX_LEVEL);

        /* mark the node at *level as having no slots left to visit */
        path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);

        cond_resched();
        return 0;
}
2158
2159 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2160                                  struct btrfs_root *root,
2161                                  struct btrfs_path *path, int *level,
2162                                  struct walk_control *wc)
2163 {
2164         u64 root_owner;
2165         int i;
2166         int slot;
2167         int ret;
2168
2169         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2170                 slot = path->slots[i];
2171                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2172                         path->slots[i]++;
2173                         *level = i;
2174                         WARN_ON(*level == 0);
2175                         return 0;
2176                 } else {
2177                         struct extent_buffer *parent;
2178                         if (path->nodes[*level] == root->node)
2179                                 parent = path->nodes[*level];
2180                         else
2181                                 parent = path->nodes[*level + 1];
2182
2183                         root_owner = btrfs_header_owner(parent);
2184                         ret = wc->process_func(root, path->nodes[*level], wc,
2185                                  btrfs_header_generation(path->nodes[*level]));
2186                         if (ret)
2187                                 return ret;
2188
2189                         if (wc->free) {
2190                                 struct extent_buffer *next;
2191
2192                                 next = path->nodes[*level];
2193
2194                                 btrfs_tree_lock(next);
2195                                 btrfs_set_lock_blocking(next);
2196                                 clean_tree_block(trans, root, next);
2197                                 btrfs_wait_tree_block_writeback(next);
2198                                 btrfs_tree_unlock(next);
2199
2200                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2201                                 ret = btrfs_free_and_pin_reserved_extent(root,
2202                                                 path->nodes[*level]->start,
2203                                                 path->nodes[*level]->len);
2204                                 if (ret)
2205                                         return ret;
2206                         }
2207                         free_extent_buffer(path->nodes[*level]);
2208                         path->nodes[*level] = NULL;
2209                         *level = i + 1;
2210                 }
2211         }
2212         return 1;
2213 }
2214
2215 /*
2216  * drop the reference count on the tree rooted at 'snap'.  This traverses
2217  * the tree freeing any blocks that have a ref count of zero after being
2218  * decremented.
2219  */
2220 static int walk_log_tree(struct btrfs_trans_handle *trans,
2221                          struct btrfs_root *log, struct walk_control *wc)
2222 {
             /*
              * Walk the log tree rooted at log->node by alternating
              * walk_down_log_tree() and walk_up_log_tree(), then hand the root
              * node itself to wc->process_func and, when wc->free is set,
              * release its extent as well.
              *
              * Returns 0 on success, -ENOMEM if no path can be allocated, a
              * negative value from a walk step, or the non-zero result of
              * wc->process_func / btrfs_free_and_pin_reserved_extent().
              */
2223         int ret = 0;
2224         int wret;
2225         int level;
2226         struct btrfs_path *path;
2227         int orig_level;
2228
2229         path = btrfs_alloc_path();
2230         if (!path)
2231                 return -ENOMEM;
2232
             /* start at the root: place it in the path at its own level */
2233         level = btrfs_header_level(log->node);
2234         orig_level = level;
2235         path->nodes[level] = log->node;
             /*
              * extra reference for the pointer now stored in the path;
              * presumably dropped again by btrfs_free_path() - TODO confirm
              */
2236         extent_buffer_get(log->node);
2237         path->slots[level] = 0;
2238
2239         while (1) {
                     /* a positive return from either step ends the walk */
2240                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2241                 if (wret > 0)
2242                         break;
2243                 if (wret < 0) {
2244                         ret = wret;
2245                         goto out;
2246                 }
2247
2248                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2249                 if (wret > 0)
2250                         break;
2251                 if (wret < 0) {
2252                         ret = wret;
2253                         goto out;
2254                 }
2255         }
2256
2257         /* was the root node processed? if not, catch it here */
2258         if (path->nodes[orig_level]) {
2259                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2260                          btrfs_header_generation(path->nodes[orig_level]));
2261                 if (ret)
2262                         goto out;
2263                 if (wc->free) {
2264                         struct extent_buffer *next;
2265
2266                         next = path->nodes[orig_level];
2267
                             /*
                              * the block stays locked while it is cleaned and
                              * its writeback is waited for
                              */
2268                         btrfs_tree_lock(next);
2269                         btrfs_set_lock_blocking(next);
2270                         clean_tree_block(trans, log, next);
2271                         btrfs_wait_tree_block_writeback(next);
2272                         btrfs_tree_unlock(next);
2273
                             /* only blocks of a log tree are expected here */
2274                         WARN_ON(log->root_key.objectid !=
2275                                 BTRFS_TREE_LOG_OBJECTID);
2276                         ret = btrfs_free_and_pin_reserved_extent(log, next->start,
2277                                                          next->len);
2278                         if (ret)
2279                                 goto out;
2280                 }
2281         }
2282
2283 out:
2284         btrfs_free_path(path);
2285         return ret;
2286 }
2287
2288 /*
2289  * Store the root item of a subvolume's log tree in the tree of log
2290  * roots.  Returns the result of the insert or update that was issued.
2291  */
2292 static int update_log_root(struct btrfs_trans_handle *trans,
2293                            struct btrfs_root *log)
2294 {
2295         struct btrfs_root *log_roots = log->fs_info->log_root_tree;
2296
2297         /* log_transid == 1: first sync, the item does not exist yet */
2298         if (log->log_transid == 1)
2299                 return btrfs_insert_root(trans, log_roots,
2300                                          &log->root_key,
2301                                          &log->root_item);
2302
2303         /* any later sync refreshes the item already in place */
2304         return btrfs_update_root(trans, log_roots, &log->root_key,
2305                                  &log->root_item);
2306 }
2307
2308 static int wait_log_commit(struct btrfs_trans_handle *trans,
2309                            struct btrfs_root *root, unsigned long transid)
2310 {
             /*
              * Sleep until the log commit using slot (transid % 2) of 'root' is
              * no longer in progress.  The wait also ends once a full commit
              * has been requested for trans->transid or root->log_transid has
              * reached transid + 2.
              *
              * root->log_mutex is dropped around the sleep and taken again
              * before the condition is re-evaluated, so the caller is expected
              * to hold it on entry.  Always returns 0.
              */
2311         DEFINE_WAIT(wait);
2312         int index = transid % 2;
2313
2314         /*
2315          * we only allow two pending log transactions at a time,
2316          * so we know that if ours is more than 2 older than the
2317          * current transaction, we're done
2318          */
2319         do {
2320                 prepare_to_wait(&root->log_commit_wait[index],
2321                                 &wait, TASK_UNINTERRUPTIBLE);
2322                 mutex_unlock(&root->log_mutex);
2323
                     /*
                      * the condition is tested once more after queueing on the
                      * wait queue and before actually going to sleep
                      */
2324                 if (root->fs_info->last_trans_log_full_commit !=
2325                     trans->transid && root->log_transid < transid + 2 &&
2326                     atomic_read(&root->log_commit[index]))
2327                         schedule();
2328
2329                 finish_wait(&root->log_commit_wait[index], &wait);
2330                 mutex_lock(&root->log_mutex);
2331         } while (root->fs_info->last_trans_log_full_commit !=
2332                  trans->transid && root->log_transid < transid + 2 &&
2333                  atomic_read(&root->log_commit[index]));
2334         return 0;
2335 }
2336
2337 static void wait_for_writer(struct btrfs_trans_handle *trans,
2338                             struct btrfs_root *root)
2339 {
             /*
              * Sleep until root->log_writers drops to zero, or until a full
              * commit has been requested for trans->transid.
              *
              * root->log_mutex is released while sleeping and re-acquired
              * afterwards, so the caller is expected to hold it on entry.
              */
2340         DEFINE_WAIT(wait);
2341         while (root->fs_info->last_trans_log_full_commit !=
2342                trans->transid && atomic_read(&root->log_writers)) {
2343                 prepare_to_wait(&root->log_writer_wait,
2344                                 &wait, TASK_UNINTERRUPTIBLE);
2345                 mutex_unlock(&root->log_mutex);
                     /* test again after queueing, before going to sleep */
2346                 if (root->fs_info->last_trans_log_full_commit !=
2347                     trans->transid && atomic_read(&root->log_writers))
2348                         schedule();
2349                 mutex_lock(&root->log_mutex);
2350                 finish_wait(&root->log_writer_wait, &wait);
2351         }
2352 }
2353
2354 /*
2355  * btrfs_sync_log sends a given tree log down to the disk and
2356  * updates the super blocks to record it.  When this call is done,
2357  * you know that any inodes previously logged are safely on disk only
2358  * if it returns 0.
2359  *
2360  * Any other return value means you need to call btrfs_commit_transaction.
2361  * Some of the edge cases for fsyncing directories that have had unlinks
2362  * or renames done in the past mean that sometimes the only safe
2363  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2364  * that has happened.
2365  */
2366 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2367                    struct btrfs_root *root)
2368 {
2369         int index1;
2370         int index2;
2371         int mark;
2372         int ret;
2373         struct btrfs_root *log = root->log_root;
2374         struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2375         unsigned long log_transid = 0;
2376         struct blk_plug plug;
2377
2378         mutex_lock(&root->log_mutex);
2379         log_transid = root->log_transid;
2380         index1 = root->log_transid % 2;
             /*
              * Another task is already committing this log transaction: wait
              * for it instead of committing a second time.
              * NOTE(review): 0 is returned here without looking at how that
              * other commit ended - confirm callers can rely on this.
              */
2381         if (atomic_read(&root->log_commit[index1])) {
2382                 wait_log_commit(trans, root, root->log_transid);
2383                 mutex_unlock(&root->log_mutex);
2384                 return 0;
2385         }
             /* claim the commit slot for this log transaction */
2386         atomic_set(&root->log_commit[index1], 1);
2387
2388         /* wait for previous tree log sync to complete */
2389         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2390                 wait_log_commit(trans, root, root->log_transid - 1);
             /*
              * wait for the current log writers, and start over whenever
              * log_batch changed while we were waiting
              */
2391         while (1) {
2392                 int batch = atomic_read(&root->log_batch);
2393                 /* when we're on an ssd, just kick the log commit out */
2394                 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2395                         mutex_unlock(&root->log_mutex);
2396                         schedule_timeout_uninterruptible(1);
2397                         mutex_lock(&root->log_mutex);
2398                 }
2399                 wait_for_writer(trans, root);
2400                 if (batch == atomic_read(&root->log_batch))
2401                         break;
2402         }
2403
2404         /* bail out if we need to do a full commit */
2405         if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2406                 ret = -EAGAIN;
2407                 btrfs_free_logged_extents(log, log_transid);
2408                 mutex_unlock(&root->log_mutex);
2409                 goto out;
2410         }
2411
             /* even log transids use EXTENT_DIRTY, odd ones EXTENT_NEW */
2412         if (log_transid % 2 == 0)
2413                 mark = EXTENT_DIRTY;
2414         else
2415                 mark = EXTENT_NEW;
2416
2417         /* we start IO on  all the marked extents here, but we don't actually
2418          * wait for them until later.
2419          */
2420         blk_start_plug(&plug);
2421         ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2422         if (ret) {
2423                 blk_finish_plug(&plug);
2424                 btrfs_abort_transaction(trans, root, ret);
2425                 btrfs_free_logged_extents(log, log_transid);
2426                 mutex_unlock(&root->log_mutex);
2427                 goto out;
2428         }
2429
2430         btrfs_set_root_node(&log->root_item, log->node);
2431
             /* open the next log transaction for this root */
2432         root->log_transid++;
2433         log->log_transid = root->log_transid;
2434         root->log_start_pid = 0;
2435         smp_mb();
2436         /*
2437          * IO has been started, blocks of the log tree have WRITTEN flag set
2438          * in their headers. new modifications of the log will be written to
2439          * new positions. so it's safe to allow log writers to go in.
2440          */
2441         mutex_unlock(&root->log_mutex);
2442
             /*
              * count ourselves as a writer of the tree of log roots while the
              * root item of this log is inserted or updated in it
              */
2443         mutex_lock(&log_root_tree->log_mutex);
2444         atomic_inc(&log_root_tree->log_batch);
2445         atomic_inc(&log_root_tree->log_writers);
2446         mutex_unlock(&log_root_tree->log_mutex);
2447
2448         ret = update_log_root(trans, log);
2449
2450         mutex_lock(&log_root_tree->log_mutex);
2451         if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2452                 smp_mb();
2453                 if (waitqueue_active(&log_root_tree->log_writer_wait))
2454                         wake_up(&log_root_tree->log_writer_wait);
2455         }
2456
             /*
              * -ENOSPC is turned into a request for a full commit (-EAGAIN);
              * any other failure aborts the transaction
              */
2457         if (ret) {
2458                 blk_finish_plug(&plug);
2459                 if (ret != -ENOSPC) {
2460                         btrfs_abort_transaction(trans, root, ret);
2461                         mutex_unlock(&log_root_tree->log_mutex);
2462                         goto out;
2463                 }
2464                 root->fs_info->last_trans_log_full_commit = trans->transid;
2465                 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2466                 btrfs_free_logged_extents(log, log_transid);
2467                 mutex_unlock(&log_root_tree->log_mutex);
2468                 ret = -EAGAIN;
2469                 goto out;
2470         }
2471
2472         index2 = log_root_tree->log_transid % 2;
             /*
              * another task is already committing the tree of log roots: wait
              * for our own log blocks and for that commit, then return 0
              */
2473         if (atomic_read(&log_root_tree->log_commit[index2])) {
2474                 blk_finish_plug(&plug);
2475                 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2476                 wait_log_commit(trans, log_root_tree,
2477                                 log_root_tree->log_transid);
2478                 btrfs_free_logged_extents(log, log_transid);
2479                 mutex_unlock(&log_root_tree->log_mutex);
2480                 ret = 0;
2481                 goto out;
2482         }
             /* from here on the out_wake_log_root label must be used */
2483         atomic_set(&log_root_tree->log_commit[index2], 1);
2484
2485         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
2486                 wait_log_commit(trans, log_root_tree,
2487                                 log_root_tree->log_transid - 1);
2488         }
2489
2490         wait_for_writer(trans, log_root_tree);
2491
2492         /*
2493          * now that we've moved on to the tree of log tree roots,
2494          * check the full commit flag again
2495          */
2496         if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2497                 blk_finish_plug(&plug);
2498                 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2499                 btrfs_free_logged_extents(log, log_transid);
2500                 mutex_unlock(&log_root_tree->log_mutex);
2501                 ret = -EAGAIN;
2502                 goto out_wake_log_root;
2503         }
2504
2505         ret = btrfs_write_marked_extents(log_root_tree,
2506                                          &log_root_tree->dirty_log_pages,
2507                                          EXTENT_DIRTY | EXTENT_NEW);
2508         blk_finish_plug(&plug);
2509         if (ret) {
2510                 btrfs_abort_transaction(trans, root, ret);
2511                 btrfs_free_logged_extents(log, log_transid);
2512                 mutex_unlock(&log_root_tree->log_mutex);
2513                 goto out_wake_log_root;
2514         }
             /* now wait for the IO started above, for both trees */
2515         btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2516         btrfs_wait_marked_extents(log_root_tree,
2517                                   &log_root_tree->dirty_log_pages,
2518                                   EXTENT_NEW | EXTENT_DIRTY);
2519         btrfs_wait_logged_extents(log, log_transid);
2520
             /* point the super block copy at the tree of log roots */
2521         btrfs_set_super_log_root(root->fs_info->super_for_commit,
2522                                 log_root_tree->node->start);
2523         btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2524                                 btrfs_header_level(log_root_tree->node));
2525
2526         log_root_tree->log_transid++;
2527         smp_mb();
2528
2529         mutex_unlock(&log_root_tree->log_mutex);
2530
2531         /*
2532          * nobody else is going to jump in and write the ctree
2533          * super here because the log_commit atomic below is protecting
2534          * us.  We must be called with a transaction handle pinning
2535          * the running transaction open, so a full commit can't hop
2536          * in and cause problems either.
2537          */
2538         btrfs_scrub_pause_super(root);
2539         ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2540         btrfs_scrub_continue_super(root);
2541         if (ret) {
2542                 btrfs_abort_transaction(trans, root, ret);
2543                 goto out_wake_log_root;
2544         }
2545
             /*
              * remember log_transid as the last committed log transaction,
              * unless a newer one has been recorded already
              */
2546         mutex_lock(&root->log_mutex);
2547         if (root->last_log_commit < log_transid)
2548                 root->last_log_commit = log_transid;
2549         mutex_unlock(&root->log_mutex);
2550
2551 out_wake_log_root:
             /* give up the commit slot of the tree of log roots, wake waiters */
2552         atomic_set(&log_root_tree->log_commit[index2], 0);
2553         smp_mb();
2554         if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2555                 wake_up(&log_root_tree->log_commit_wait[index2]);
2556 out:
             /* give up the commit slot of this root and wake its waiters */
2557         atomic_set(&root->log_commit[index1], 0);
2558         smp_mb();
2559         if (waitqueue_active(&root->log_commit_wait[index1]))
2560                 wake_up(&root->log_commit_wait[index1]);
2561         return ret;
2562 }
2563
2564 static void free_log_tree(struct btrfs_trans_handle *trans,
2565                           struct btrfs_root *log)
2566 {
2567         struct walk_control wc = {
2568                 .free = 1,
2569                 .process_func = process_one_buffer
2570         };
2571         u64 first;
2572         u64 last;
2573         int err;
2574
2575         /*
2576          * Without a transaction handle the tree blocks are left alone and
2577          * only the in-memory state below is torn down.
2578          */
2579         if (trans) {
2580                 err = walk_log_tree(trans, log, &wc);
2581                 /* not expected to fail; abort the transaction if it does */
2582                 if (err)
2583                         btrfs_abort_transaction(trans, log, err);
2584         }
2585
2586         /* forget every range still marked as dirty or new log pages */
2587         while (!find_first_extent_bit(&log->dirty_log_pages, 0, &first,
2588                                       &last, EXTENT_DIRTY | EXTENT_NEW,
2589                                       NULL))
2590                 clear_extent_bits(&log->dirty_log_pages, first, last,
2591                                   EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2592
2593         /*
2594          * A full commit may have cut the log sync short and left ordered
2595          * extents queued for log transaction index 0 or 1; drop them so
2596          * that no inodes or memory are leaked.
2597          */
2598         btrfs_free_logged_extents(log, 0);
2599         btrfs_free_logged_extents(log, 1);
2600
2601         /* finally release the root node and the root structure itself */
2602         free_extent_buffer(log->node);
2603         kfree(log);
2604 }
2605
2606 /*
2607  * Release the log tree of 'root', if it has one, together with all the
2608  * extents it uses.  Meant to run when the full transaction commits.
2609  */
2610 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2611 {
2612         if (!root->log_root)
2613                 return 0;
2614         free_log_tree(trans, root->log_root);
2615         root->log_root = NULL;
2616         return 0;
2617 }
2618
2619 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2620                              struct btrfs_fs_info *fs_info)
2621 {
             /* Free the tree of log roots, if there is one.  Always returns 0. */
2622         if (!fs_info->log_root_tree)
2623                 return 0;
2624         free_log_tree(trans, fs_info->log_root_tree);
2625         fs_info->log_root_tree = NULL;
2626         return 0;
2627 }
2628
2629 /*
2630  * If both a file and directory are logged, and unlinks or renames are
2631  * mixed in, we have a few interesting corners:
2632  *
2633  * create file X in dir Y
2634  * link file X to X.link in dir Y
2635  * fsync file X
2636  * unlink file X but leave X.link
2637  * fsync dir Y
2638  *
2639  * After a crash we would expect only X.link to exist.  But file X
2640  * didn't get fsync'd again so the log has back refs for X and X.link.
2641  *
2642  * We solve this by removing directory entries and inode backrefs from the
2643  * log when a file that was logged in the current transaction is
2644  * unlinked.  Any later fsync will include the updated log entries, and
2645  * we'll be able to reconstruct the proper directory items from backrefs.
2646  *
2647  * This optimization allows us to avoid relogging the entire inode
2648  * or the entire directory.
      *
      * The dir item and the dir index item (at 'index') for 'name' are removed
      * from the log of 'dir', and the directory size stored in the logged
      * inode item is reduced by name_len for each entry that was removed.
      *
      * Returns 0 on success, when 'dir' was not logged in this transaction, or
      * when no log transaction could be joined.  -ENOSPC is never returned:
      * the transaction is flagged for a full commit instead and 0 is reported.
      * -ENOENT is returned as is; any other error is returned after the
      * transaction has been aborted.
2649  */
2650 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2651                                  struct btrfs_root *root,
2652                                  const char *name, int name_len,
2653                                  struct inode *dir, u64 index)
2654 {
2655         struct btrfs_root *log;
2656         struct btrfs_dir_item *di;
2657         struct btrfs_path *path;
2658         int ret;
2659         int err = 0;
2660         int bytes_del = 0;
2661         u64 dir_ino = btrfs_ino(dir);
2662
             /* nothing about this directory is in the log of this transaction */
2663         if (BTRFS_I(dir)->logged_trans < trans->transid)
2664                 return 0;
2665
2666         ret = join_running_log_trans(root);
2667         if (ret)
2668                 return 0;
2669
2670         mutex_lock(&BTRFS_I(dir)->log_mutex);
2671
2672         log = root->log_root;
2673         path = btrfs_alloc_path();
2674         if (!path) {
2675                 err = -ENOMEM;
2676                 goto out_unlock;
2677         }
2678
             /* first the entry keyed by name ... */
2679         di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2680                                    name, name_len, -1);
2681         if (IS_ERR(di)) {
2682                 err = PTR_ERR(di);
2683                 goto fail;
2684         }
2685         if (di) {
2686                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2687                 bytes_del += name_len;
2688                 if (ret) {
2689                         err = ret;
2690                         goto fail;
2691                 }
2692         }
2693         btrfs_release_path(path);
             /* ... then the entry keyed by directory index */
2694         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2695                                          index, name, name_len, -1);
2696         if (IS_ERR(di)) {
2697                 err = PTR_ERR(di);
2698                 goto fail;
2699         }
2700         if (di) {
2701                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2702                 bytes_del += name_len;
2703                 if (ret) {
2704                         err = ret;
2705                         goto fail;
2706                 }
2707         }
2708
2709         /* update the directory size in the log to reflect the names
2710          * we have removed
2711          */
2712         if (bytes_del) {
2713                 struct btrfs_key key;
2714
2715                 key.objectid = dir_ino;
2716                 key.offset = 0;
2717                 key.type = BTRFS_INODE_ITEM_KEY;
2718                 btrfs_release_path(path);
2719
2720                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2721                 if (ret < 0) {
2722                         err = ret;
2723                         goto fail;
2724                 }
2725                 if (ret == 0) {
2726                         struct btrfs_inode_item *item;
2727                         u64 i_size;
2728
2729                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2730                                               struct btrfs_inode_item);
2731                         i_size = btrfs_inode_size(path->nodes[0], item);
                             /* never let the logged size wrap below zero */
2732                         if (i_size > bytes_del)
2733                                 i_size -= bytes_del;
2734                         else
2735                                 i_size = 0;
2736                         btrfs_set_inode_size(path->nodes[0], item, i_size);
2737                         btrfs_mark_buffer_dirty(path->nodes[0]);
2738                 } else
2739                         ret = 0;
2740                 btrfs_release_path(path);
2741         }
2742 fail:
2743         btrfs_free_path(path);
2744 out_unlock:
2745         mutex_unlock(&BTRFS_I(dir)->log_mutex);
             /*
              * Every failure above is recorded in 'err', while 'ret' may hold a
              * stale value from an earlier call, so the fallback decision has
              * to be based on 'err'.  -ENOENT only means the name was not in
              * the log and is handed back to the caller untouched.
              */
2746         if (err == -ENOSPC) {
2747                 root->fs_info->last_trans_log_full_commit = trans->transid;
2748                 err = 0;
2749         } else if (err < 0 && err != -ENOENT)
2750                 btrfs_abort_transaction(trans, root, err);
2751
2752         btrfs_end_log_trans(root);
2753
2754         return err;
2755 }
2756
2757 /*
      * see comments for btrfs_del_dir_entries_in_log
      *
      * Removes the inode ref for 'name' in directory 'dirid' from the log of
      * 'inode'.  Returns 0 when the inode was not logged in this transaction
      * or no log transaction could be joined.  -ENOSPC is turned into a full
      * commit request and reported as 0; -ENOENT is returned without aborting;
      * any other negative error aborts the transaction and is returned.
      */
2758 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2759                                struct btrfs_root *root,
2760                                const char *name, int name_len,
2761                                struct inode *inode, u64 dirid)
2762 {
2763         struct btrfs_root *log;
2764         u64 index;
2765         int ret;
2766
2767         if (BTRFS_I(inode)->logged_trans < trans->transid)
2768                 return 0;
2769
2770         ret = join_running_log_trans(root);
2771         if (ret)
2772                 return 0;
2773         log = root->log_root;
2774         mutex_lock(&BTRFS_I(inode)->log_mutex);
2775
             /* 'index' receives a value from the callee but is not used here */
2776         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2777                                   dirid, &index);
2778         mutex_unlock(&BTRFS_I(inode)->log_mutex);
2779         if (ret == -ENOSPC) {
2780                 root->fs_info->last_trans_log_full_commit = trans->transid;
2781                 ret = 0;
2782         } else if (ret < 0 && ret != -ENOENT)
2783                 btrfs_abort_transaction(trans, root, ret);
2784         btrfs_end_log_trans(root);
2785
2786         return ret;
2787 }
2788
2789 /*
2790  * Add a range item for directory 'dirid' to the log tree.  The item is
2791  * keyed at first_offset and stores last_offset as its end; together they
2792  * describe the part of the key space the log is authoritative for.
2793  */
2794 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2795                                        struct btrfs_root *log,
2796                                        struct btrfs_path *path,
2797                                        int key_type, u64 dirid,
2798                                        u64 first_offset, u64 last_offset)
2799 {
2800         struct btrfs_dir_log_item *range;
2801         struct extent_buffer *leaf;
2802         struct btrfs_key range_key;
2803         int err;
2804
2805         range_key.objectid = dirid;
2806         range_key.type = (key_type == BTRFS_DIR_ITEM_KEY) ?
2807                 BTRFS_DIR_LOG_ITEM_KEY : BTRFS_DIR_LOG_INDEX_KEY;
2808         range_key.offset = first_offset;
2809         err = btrfs_insert_empty_item(trans, log, path, &range_key,
2810                                       sizeof(*range));
2811         if (err)
2812                 return err;
2813
2814         leaf = path->nodes[0];
2815         range = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_log_item);
2816         btrfs_set_dir_log_end(leaf, range, last_offset);
2817         btrfs_mark_buffer_dirty(leaf);
2818         btrfs_release_path(path);
2819         return 0;
2820 }
2821
2822 /*
2823  * log all the items included in the current transaction for a given
2824  * directory.  This also creates the range items in the log tree required
2825  * to replay anything deleted before the fsync
      *
      * Only keys of 'key_type' belonging to 'inode' are considered, starting
      * at offset 'min_offset'.  'path' is used to search the subvolume tree
      * and to insert the range item; 'dst_path' is passed to overwrite_item().
      *
      * On success *last_offset_ret receives the end of the logged range:
      * (u64)-1 when no further items of this type follow, otherwise the
      * offset of the last item copied, so the caller can continue behind it.
      *
      * Returns 0 on success or a negative error from the tree searches, from
      * overwrite_item() or from insert_dir_log_key().
2826  */
2827 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2828                           struct btrfs_root *root, struct inode *inode,
2829                           struct btrfs_path *path,
2830                           struct btrfs_path *dst_path, int key_type,
2831                           u64 min_offset, u64 *last_offset_ret)
2832 {
2833         struct btrfs_key min_key;
2834         struct btrfs_key max_key;
2835         struct btrfs_root *log = root->log_root;
2836         struct extent_buffer *src;
2837         int err = 0;
2838         int ret;
2839         int i;
2840         int nritems;
2841         u64 first_offset = min_offset;
2842         u64 last_offset = (u64)-1;
2843         u64 ino = btrfs_ino(inode);
2844
2845         log = root->log_root;
2846         max_key.objectid = ino;
2847         max_key.offset = (u64)-1;
2848         max_key.type = key_type;
2849
2850         min_key.objectid = ino;
2851         min_key.type = key_type;
2852         min_key.offset = min_offset;
2853
2854         path->keep_locks = 1;
2855
2856         ret = btrfs_search_forward(root, &min_key, &max_key,
2857                                    path, trans->transid);
2858
2859         /*
2860          * we didn't find anything from this transaction, see if there
2861          * is anything at all
2862          */
2863         if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2864                 min_key.objectid = ino;
2865                 min_key.type = key_type;
2866                 min_key.offset = (u64)-1;
2867                 btrfs_release_path(path);
2868                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2869                 if (ret < 0) {
2870                         btrfs_release_path(path);
2871                         return ret;
2872                 }
2873                 ret = btrfs_previous_item(root, path, ino, key_type);
2874
2875                 /* if ret == 0 there are items for this type,
2876                  * create a range to tell us the last key of this type.
2877                  * otherwise, there are no items in this directory after
2878                  * *min_offset, and we create a range to indicate that.
2879                  */
2880                 if (ret == 0) {
2881                         struct btrfs_key tmp;
2882                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2883                                               path->slots[0]);
2884                         if (key_type == tmp.type)
2885                                 first_offset = max(min_offset, tmp.offset) + 1;
2886                 }
2887                 goto done;
2888         }
2889
2890         /* go backward to find any previous key */
2891         ret = btrfs_previous_item(root, path, ino, key_type);
2892         if (ret == 0) {
2893                 struct btrfs_key tmp;
2894                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2895                 if (key_type == tmp.type) {
2896                         first_offset = tmp.offset;
2897                         ret = overwrite_item(trans, log, dst_path,
2898                                              path->nodes[0], path->slots[0],
2899                                              &tmp);
2900                         if (ret) {
2901                                 err = ret;
2902                                 goto done;
2903                         }
2904                 }
2905         }
2906         btrfs_release_path(path);
2907
2908         /* find the first key from this transaction again */
2909         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2910         if (ret != 0) {
2911                 WARN_ON(1);
                     /*
                      * a failed search must not be reported as success, nor
                      * produce a range item for items that were never logged
                      */
                     if (ret < 0)
                             err = ret;
2912                 goto done;
2913         }
2914
2915         /*
2916          * we have a block from this transaction, log every item in it
2917          * from our directory
2918          */
2919         while (1) {
2920                 struct btrfs_key tmp;
2921                 src = path->nodes[0];
2922                 nritems = btrfs_header_nritems(src);
2923                 for (i = path->slots[0]; i < nritems; i++) {
2924                         btrfs_item_key_to_cpu(src, &min_key, i);
2925
2926                         if (min_key.objectid != ino || min_key.type != key_type)
2927                                 goto done;
2928                         ret = overwrite_item(trans, log, dst_path, src, i,
2929                                              &min_key);
2930                         if (ret) {
2931                                 err = ret;
2932                                 goto done;
2933                         }
2934                 }
2935                 path->slots[0] = nritems;
2936
2937                 /*
2938                  * look ahead to the next item and see if it is also
2939                  * from this directory and from this transaction
2940                  */
2941                 ret = btrfs_next_leaf(root, path);
                     /* do not read a key from the path after a failed lookup */
                     if (ret < 0) {
                             err = ret;
                             goto done;
                     }
2942                 if (ret == 1) {
2943                         last_offset = (u64)-1;
2944                         goto done;
2945                 }
2946                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2947                 if (tmp.objectid != ino || tmp.type != key_type) {
2948                         last_offset = (u64)-1;
2949                         goto done;
2950                 }
                     /*
                      * the next leaf is from another generation: copy just its
                      * first item and stop there
                      */
2951                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2952                         ret = overwrite_item(trans, log, dst_path,
2953                                              path->nodes[0], path->slots[0],
2954                                              &tmp);
2955                         if (ret)
2956                                 err = ret;
2957                         else
2958                                 last_offset = tmp.offset;
2959                         goto done;
2960                 }
2961         }
2962 done:
2963         btrfs_release_path(path);
2964         btrfs_release_path(dst_path);
2965
2966         if (err == 0) {
2967                 *last_offset_ret = last_offset;
2968                 /*
2969                  * insert the log range keys to indicate where the log
2970                  * is valid
2971                  */
2972                 ret = insert_dir_log_key(trans, log, path, key_type,
2973                                          ino, first_offset, last_offset);
2974                 if (ret)
2975                         err = ret;
2976         }
2977         return err;
2978 }
2979
2980 /*
2981  * Logging a directory works much like logging an inode: every item changed
2982  * in the current transaction is copied into the log.
2983  *
2984  * During recovery the directory in the subvolume is scanned, and a key that
2985  * lies inside a logged range but is missing from the log tree is taken to
2986  * be an entry unlinked during the transaction.
2987  *
2988  * For that scan to work the log must also hold one key smaller than the
2989  * smallest key logged by this transaction and one key larger than the
2990  * largest.
2991  */
2992 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2993                           struct btrfs_root *root, struct inode *inode,
2994                           struct btrfs_path *path,
2995                           struct btrfs_path *dst_path)
2996 {
2997         int key_type = BTRFS_DIR_ITEM_KEY;
2998
2999         /* first pass logs the dir items, second pass the dir indexes */
3000         for (;;) {
3001                 u64 next_offset = 0;
3002                 u64 last_logged = 0;
3003                 int err;
3004
3005                 /* keep going until the end of the key space is reached */
3006                 do {
3007                         err = log_dir_items(trans, root, inode, path,
3008                                             dst_path, key_type,
3009                                             next_offset, &last_logged);
3010                         if (err)
3011                                 return err;
3012                         next_offset = last_logged + 1;
3013                 } while (last_logged != (u64)-1);
3014
3015                 if (key_type == BTRFS_DIR_INDEX_KEY)
3016                         break;
3017                 key_type = BTRFS_DIR_INDEX_KEY;
3018         }
3019
3020         return 0;
3021 }
3022
3023 /*
3024  * a helper function to drop items from the log before we relog an
3025  * inode.  max_key_type indicates the highest item type to remove.
3026  * This cannot be run for file data extents because it does not
3027  * free the extents they point to.
      *
      * Items are removed leaf by leaf, from the highest key of 'objectid'
      * downwards.  Returns 0 on success or a negative error from the search
      * or the deletion; positive search results are reported as 0.
3028  */
3029 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3030                                   struct btrfs_root *log,
3031                                   struct btrfs_path *path,
3032                                   u64 objectid, int max_key_type)
3033 {
3034         int ret;
3035         struct btrfs_key key;
3036         struct btrfs_key found_key;
3037         int start_slot;
3038
             /* search key: (objectid, max_key_type, largest possible offset) */
3039         key.objectid = objectid;
3040         key.type = max_key_type;
3041         key.offset = (u64)-1;
3042
3043         while (1) {
3044                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
                     /* an exact match for the search key is treated as a bug */
3045                 BUG_ON(ret == 0); /* Logic error */
3046                 if (ret < 0)
3047                         break;
3048
                     /* nothing in front of the returned slot in this leaf */
3049                 if (path->slots[0] == 0)
3050                         break;
3051
                     /* step back to the item just before the search position */
3052                 path->slots[0]--;
3053                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3054                                       path->slots[0]);
3055
3056                 if (found_key.objectid != objectid)
3057                         break;
3058
                     /*
                      * locate the slot where (objectid, 0, 0) would go in this
                      * leaf; only start_slot is used, the return value is
                      * overwritten by the call below
                      */
3059                 found_key.offset = 0;
3060                 found_key.type = 0;
3061                 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3062                                        &start_slot);
3063
                     /* delete slots start_slot through path->slots[0] */
3064                 ret = btrfs_del_items(trans, log, path, start_slot,
3065                                       path->slots[0] - start_slot + 1);
3066                 /*
3067                  * If start slot isn't 0 then we don't need to re-search, we've
3068                  * found the last guy with the objectid in this tree.
3069                  */
3070                 if (ret || start_slot != 0)
3071                         break;
3072                 btrfs_release_path(path);
3073         }
3074         btrfs_release_path(path);
             /* a positive value left over from the search is not an error */
3075         if (ret > 0)
3076                 ret = 0;
3077         return ret;
3078 }
3079
3080 static void fill_inode_item(struct btrfs_trans_handle *trans,
3081                             struct extent_buffer *leaf,
3082                             struct btrfs_inode_item *item,
3083                             struct inode *inode, int log_inode_only)
3084 {
3085         struct btrfs_map_token token;
3086
3087         btrfs_init_map_token(&token);
3088
3089         if (log_inode_only) {
3090                 /* set the generation to zero so the recover code
3091                  * can tell the difference between an logging
3092                  * just to say 'this inode exists' and a logging
3093                  * to say 'update this inode with these values'
3094                  */
3095                 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3096                 btrfs_set_token_inode_size(leaf, item, 0, &token);
3097         } else {
3098                 btrfs_set_token_inode_generation(leaf, item,
3099                                                  BTRFS_I(inode)->generation,
3100                                                  &token);
3101                 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3102         }
3103
3104         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3105         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3106         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3107         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3108
3109         btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3110                                      inode->i_atime.tv_sec, &token);
3111         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3112                                       inode->i_atime.tv_nsec, &token);
3113
3114         btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3115                                      inode->i_mtime.tv_sec, &token);
3116         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3117                                       inode->i_mtime.tv_nsec, &token);
3118
3119         btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3120                                      inode->i_ctime.tv_sec, &token);
3121         btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3122                                       inode->i_ctime.tv_nsec, &token);
3123
3124         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3125                                      &token);
3126
3127         btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3128         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3129         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3130         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3131         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3132 }
3133
3134 static int log_inode_item(struct btrfs_trans_handle *trans,
3135                           struct btrfs_root *log, struct btrfs_path *path,
3136                           struct inode *inode)
3137 {
3138         struct btrfs_inode_item *inode_item;
3139         struct btrfs_key key;
3140         int ret;
3141
3142         memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3143         ret = btrfs_insert_empty_item(trans, log, path, &key,
3144                                       sizeof(*inode_item));
3145         if (ret && ret != -EEXIST)
3146                 return ret;
3147         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3148                                     struct btrfs_inode_item);
3149         fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3150         btrfs_release_path(path);
3151         return 0;
3152 }
3153
3154 static noinline int copy_items(struct btrfs_trans_handle *trans,
3155                                struct inode *inode,
3156                                struct btrfs_path *dst_path,
3157                                struct extent_buffer *src,
3158                                int start_slot, int nr, int inode_only)
3159 {
3160         unsigned long src_offset;
3161         unsigned long dst_offset;
3162         struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3163         struct btrfs_file_extent_item *extent;
3164         struct btrfs_inode_item *inode_item;
3165         int ret;
3166         struct btrfs_key *ins_keys;
3167         u32 *ins_sizes;
3168         char *ins_data;
3169         int i;
3170         struct list_head ordered_sums;
3171         int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3172
3173         INIT_LIST_HEAD(&ordered_sums);
3174
3175         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3176                            nr * sizeof(u32), GFP_NOFS);
3177         if (!ins_data)
3178                 return -ENOMEM;
3179
3180         ins_sizes = (u32 *)ins_data;
3181         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3182
3183         for (i = 0; i < nr; i++) {
3184                 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3185                 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3186         }
3187         ret = btrfs_insert_empty_items(trans, log, dst_path,
3188                                        ins_keys, ins_sizes, nr);
3189         if (ret) {
3190                 kfree(ins_data);
3191                 return ret;
3192         }
3193
3194         for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3195                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3196                                                    dst_path->slots[0]);
3197
3198                 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3199
3200                 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3201                         inode_item = btrfs_item_ptr(dst_path->nodes[0],
3202                                                     dst_path->slots[0],
3203                                                     struct btrfs_inode_item);
3204                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
3205                                         inode, inode_only == LOG_INODE_EXISTS);
3206                 } else {
3207                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3208                                            src_offset, ins_sizes[i]);
3209                 }
3210
3211                 /* take a reference on file data extents so that truncates
3212                  * or deletes of this inode don't have to relog the inode
3213                  * again
3214                  */
3215                 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
3216                     !skip_csum) {
3217                         int found_type;
3218                         extent = btrfs_item_ptr(src, start_slot + i,
3219                                                 struct btrfs_file_extent_item);
3220
3221                         if (btrfs_file_extent_generation(src, extent) < trans->transid)
3222                                 continue;
3223
3224                         found_type = btrfs_file_extent_type(src, extent);
3225                         if (found_type == BTRFS_FILE_EXTENT_REG) {
3226                                 u64 ds, dl, cs, cl;
3227                                 ds = btrfs_file_extent_disk_bytenr(src,
3228                                                                 extent);
3229                                 /* ds == 0 is a hole */
3230                                 if (ds == 0)
3231                                         continue;
3232
3233                                 dl = btrfs_file_extent_disk_num_bytes(src,
3234                                                                 extent);
3235                                 cs = btrfs_file_extent_offset(src, extent);
3236                                 cl = btrfs_file_extent_num_bytes(src,
3237                                                                 extent);
3238                                 if (btrfs_file_extent_compression(src,
3239                                                                   extent)) {
3240                                         cs = 0;
3241                                         cl = dl;
3242                                 }
3243
3244                                 ret = btrfs_lookup_csums_range(
3245                                                 log->fs_info->csum_root,
3246                                                 ds + cs, ds + cs + cl - 1,
3247                                                 &ordered_sums, 0);
3248                                 if (ret) {
3249                                         btrfs_release_path(dst_path);
3250                                         kfree(ins_data);
3251                                         return ret;
3252                                 }
3253                         }
3254                 }
3255         }
3256
3257         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3258         btrfs_release_path(dst_path);
3259         kfree(ins_data);
3260
3261         /*
3262          * we have to do this after the loop above to avoid changing the
3263          * log tree while trying to change the log tree.
3264          */
3265         ret = 0;
3266         while (!list_empty(&ordered_sums)) {
3267                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3268                                                    struct btrfs_ordered_sum,
3269                                                    list);
3270                 if (!ret)
3271                         ret = btrfs_csum_file_blocks(trans, log, sums);
3272                 list_del(&sums->list);
3273                 kfree(sums);
3274         }
3275         return ret;
3276 }
3277
3278 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3279 {
3280         struct extent_map *em1, *em2;
3281
3282         em1 = list_entry(a, struct extent_map, list);
3283         em2 = list_entry(b, struct extent_map, list);
3284
3285         if (em1->start < em2->start)
3286                 return -1;
3287         else if (em1->start > em2->start)
3288                 return 1;
3289         return 0;
3290 }
3291
3292 static int log_one_extent(struct btrfs_trans_handle *trans,
3293                           struct inode *inode, struct btrfs_root *root,
3294                           struct extent_map *em, struct btrfs_path *path)
3295 {
3296         struct btrfs_root *log = root->log_root;
3297         struct btrfs_file_extent_item *fi;
3298         struct extent_buffer *leaf;
3299         struct btrfs_ordered_extent *ordered;
3300         struct list_head ordered_sums;
3301         struct btrfs_map_token token;
3302         struct btrfs_key key;
3303         u64 mod_start = em->mod_start;
3304         u64 mod_len = em->mod_len;
3305         u64 csum_offset;
3306         u64 csum_len;
3307         u64 extent_offset = em->start - em->orig_start;
3308         u64 block_len;
3309         int ret;
3310         int index = log->log_transid % 2;
3311         bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3312
3313         ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3314                                    em->start + em->len, NULL, 0);
3315         if (ret)
3316                 return ret;
3317
3318         INIT_LIST_HEAD(&ordered_sums);
3319         btrfs_init_map_token(&token);
3320         key.objectid = btrfs_ino(inode);
3321         key.type = BTRFS_EXTENT_DATA_KEY;
3322         key.offset = em->start;
3323
3324         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3325         if (ret)
3326                 return ret;
3327         leaf = path->nodes[0];
3328         fi = btrfs_item_ptr(leaf, path->slots[0],
3329                             struct btrfs_file_extent_item);
3330
3331         btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3332                                                &token);
3333         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3334                 skip_csum = true;
3335                 btrfs_set_token_file_extent_type(leaf, fi,
3336                                                  BTRFS_FILE_EXTENT_PREALLOC,
3337                                                  &token);
3338         } else {
3339                 btrfs_set_token_file_extent_type(leaf, fi,
3340                                                  BTRFS_FILE_EXTENT_REG,
3341                                                  &token);
3342                 if (em->block_start == 0)
3343                         skip_csum = true;
3344         }
3345
3346         block_len = max(em->block_len, em->orig_block_len);
3347         if (em->compress_type != BTRFS_COMPRESS_NONE) {
3348                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3349                                                         em->block_start,
3350                                                         &token);
3351                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3352                                                            &token);
3353         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3354                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3355                                                         em->block_start -
3356                                                         extent_offset, &token);
3357                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3358                                                            &token);
3359         } else {
3360                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3361                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3362                                                            &token);
3363         }
3364
3365         btrfs_set_token_file_extent_offset(leaf, fi,
3366                                            em->start - em->orig_start,
3367                                            &token);
3368         btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3369         btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3370         btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3371                                                 &token);
3372         btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3373         btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3374         btrfs_mark_buffer_dirty(leaf);
3375
3376         btrfs_release_path(path);
3377         if (ret) {
3378                 return ret;
3379         }
3380
3381         if (skip_csum)
3382                 return 0;
3383
3384         if (em->compress_type) {
3385                 csum_offset = 0;
3386                 csum_len = block_len;
3387         }
3388
3389         /*
3390          * First check and see if our csums are on our outstanding ordered
3391          * extents.
3392          */
3393 again:
3394         spin_lock_irq(&log->log_extents_lock[index]);
3395         list_for_each_entry(ordered, &log->logged_list[index], log_list) {
3396                 struct btrfs_ordered_sum *sum;
3397
3398                 if (!mod_len)
3399                         break;
3400
3401                 if (ordered->inode != inode)
3402                         continue;
3403
3404                 if (ordered->file_offset + ordered->len <= mod_start ||
3405                     mod_start + mod_len <= ordered->file_offset)
3406                         continue;
3407
3408                 /*
3409                  * We are going to copy all the csums on this ordered extent, so
3410                  * go ahead and adjust mod_start and mod_len in case this
3411                  * ordered extent has already been logged.
3412                  */
3413                 if (ordered->file_offset > mod_start) {
3414                         if (ordered->file_offset + ordered->len >=
3415                             mod_start + mod_len)
3416                                 mod_len = ordered->file_offset - mod_start;
3417                         /*
3418                          * If we have this case
3419                          *
3420                          * |--------- logged extent ---------|
3421                          *       |----- ordered extent ----|
3422                          *
3423                          * Just don't mess with mod_start and mod_len, we'll
3424                          * just end up logging more csums than we need and it
3425                          * will be ok.
3426                          */
3427                 } else {
3428                         if (ordered->file_offset + ordered->len <
3429                             mod_start + mod_len) {
3430                                 mod_len = (mod_start + mod_len) -
3431                                         (ordered->file_offset + ordered->len);
3432                                 mod_start = ordered->file_offset +
3433                                         ordered->len;
3434                         } else {
3435                                 mod_len = 0;
3436                         }
3437                 }
3438
3439                 /*
3440                  * To keep us from looping for the above case of an ordered
3441                  * extent that falls inside of the logged extent.
3442                  */
3443                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
3444                                      &ordered->flags))
3445                         continue;
3446                 atomic_inc(&ordered->refs);
3447                 spin_unlock_irq(&log->log_extents_lock[index]);
3448                 /*
3449                  * we've dropped the lock, we must either break or
3450                  * start over after this.
3451                  */
3452
3453                 wait_event(ordered->wait, ordered->csum_bytes_left == 0);
3454
3455                 list_for_each_entry(sum, &ordered->list, list) {
3456                         ret = btrfs_csum_file_blocks(trans, log, sum);
3457                         if (ret) {
3458                                 btrfs_put_ordered_extent(ordered);
3459                                 goto unlocked;
3460                         }
3461                 }
3462                 btrfs_put_ordered_extent(ordered);
3463                 goto again;
3464
3465         }
3466         spin_unlock_irq(&log->log_extents_lock[index]);
3467 unlocked:
3468
3469         if (!mod_len || ret)
3470                 return ret;
3471
3472         csum_offset = mod_start - em->start;
3473         csum_len = mod_len;
3474
3475         /* block start is already adjusted for the file extent offset. */
3476         ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3477                                        em->block_start + csum_offset,
3478                                        em->block_start + csum_offset +
3479                                        csum_len - 1, &ordered_sums, 0);
3480         if (ret)
3481                 return ret;
3482
3483         while (!list_empty(&ordered_sums)) {
3484                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3485                                                    struct btrfs_ordered_sum,
3486                                                    list);
3487                 if (!ret)
3488                         ret = btrfs_csum_file_blocks(trans, log, sums);
3489                 list_del(&sums->list);
3490                 kfree(sums);
3491         }
3492
3493         return ret;
3494 }
3495
3496 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3497                                      struct btrfs_root *root,
3498                                      struct inode *inode,
3499                                      struct btrfs_path *path)
3500 {
3501         struct extent_map *em, *n;
3502         struct list_head extents;
3503         struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3504         u64 test_gen;
3505         int ret = 0;
3506         int num = 0;
3507
3508         INIT_LIST_HEAD(&extents);
3509
3510         write_lock(&tree->lock);
3511         test_gen = root->fs_info->last_trans_committed;
3512
3513         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
3514                 list_del_init(&em->list);
3515
3516                 /*
3517                  * Just an arbitrary number, this can be really CPU intensive
3518                  * once we start getting a lot of extents, and really once we
3519                  * have a bunch of extents we just want to commit since it will
3520                  * be faster.
3521                  */
3522                 if (++num > 32768) {
3523                         list_del_init(&tree->modified_extents);
3524                         ret = -EFBIG;
3525                         goto process;
3526                 }
3527
3528                 if (em->generation <= test_gen)
3529                         continue;
3530                 /* Need a ref to keep it from getting evicted from cache */
3531                 atomic_inc(&em->refs);
3532                 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
3533                 list_add_tail(&em->list, &extents);
3534                 num++;
3535         }
3536
3537         list_sort(NULL, &extents, extent_cmp);
3538
3539 process:
3540         while (!list_empty(&extents)) {
3541                 em = list_entry(extents.next, struct extent_map, list);
3542
3543                 list_del_init(&em->list);
3544
3545                 /*
3546                  * If we had an error we just need to delete everybody from our
3547                  * private list.
3548                  */
3549                 if (ret) {
3550                         clear_em_logging(tree, em);
3551                         free_extent_map(em);
3552                         continue;
3553                 }
3554
3555                 write_unlock(&tree->lock);
3556
3557                 ret = log_one_extent(trans, inode, root, em, path);
3558                 write_lock(&tree->lock);
3559                 clear_em_logging(tree, em);
3560                 free_extent_map(em);
3561         }
3562         WARN_ON(!list_empty(&extents));
3563         write_unlock(&tree->lock);
3564
3565         btrfs_release_path(path);
3566         return ret;
3567 }
3568
3569 /* log a single inode in the tree log.
3570  * At least one parent directory for this inode must exist in the tree
3571  * or be logged already.
3572  *
3573  * Any items from this inode changed by the current transaction are copied
3574  * to the log tree.  An extra reference is taken on any extents in this
3575  * file, allowing us to avoid a whole pile of corner cases around logging
3576  * blocks that have been removed from the tree.
3577  *
3578  * See LOG_INODE_ALL and related defines for a description of what inode_only
3579  * does.
3580  *
3581  * This handles both files and directories.
3582  */
3583 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3584                              struct btrfs_root *root, struct inode *inode,
3585                              int inode_only)
3586 {
3587         struct btrfs_path *path;
3588         struct btrfs_path *dst_path;
3589         struct btrfs_key min_key;
3590         struct btrfs_key max_key;
3591         struct btrfs_root *log = root->log_root;
3592         struct extent_buffer *src = NULL;
3593         int err = 0;
3594         int ret;
3595         int nritems;
3596         int ins_start_slot = 0;
3597         int ins_nr;
3598         bool fast_search = false;
3599         u64 ino = btrfs_ino(inode);
3600
3601         path = btrfs_alloc_path();
3602         if (!path)
3603                 return -ENOMEM;
3604         dst_path = btrfs_alloc_path();
3605         if (!dst_path) {
3606                 btrfs_free_path(path);
3607                 return -ENOMEM;
3608         }
3609
3610         min_key.objectid = ino;
3611         min_key.type = BTRFS_INODE_ITEM_KEY;
3612         min_key.offset = 0;
3613
3614         max_key.objectid = ino;
3615
3616
3617         /* today the code can only do partial logging of directories */
3618         if (S_ISDIR(inode->i_mode) ||
3619             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3620                        &BTRFS_I(inode)->runtime_flags) &&
3621              inode_only == LOG_INODE_EXISTS))
3622                 max_key.type = BTRFS_XATTR_ITEM_KEY;
3623         else
3624                 max_key.type = (u8)-1;
3625         max_key.offset = (u64)-1;
3626
3627         /* Only run delayed items if we are a dir or a new file */
3628         if (S_ISDIR(inode->i_mode) ||
3629             BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
3630                 ret = btrfs_commit_inode_delayed_items(trans, inode);
3631                 if (ret) {
3632                         btrfs_free_path(path);
3633                         btrfs_free_path(dst_path);
3634                         return ret;
3635                 }
3636         }
3637
3638         mutex_lock(&BTRFS_I(inode)->log_mutex);
3639
3640         btrfs_get_logged_extents(log, inode);
3641
3642         /*
3643          * a brute force approach to making sure we get the most uptodate
3644          * copies of everything.
3645          */
3646         if (S_ISDIR(inode->i_mode)) {
3647                 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
3648
3649                 if (inode_only == LOG_INODE_EXISTS)
3650                         max_key_type = BTRFS_XATTR_ITEM_KEY;
3651                 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
3652         } else {
3653                 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3654                                        &BTRFS_I(inode)->runtime_flags)) {
3655                         clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3656                                   &BTRFS_I(inode)->runtime_flags);
3657                         ret = btrfs_truncate_inode_items(trans, log,
3658                                                          inode, 0, 0);
3659                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3660                                               &BTRFS_I(inode)->runtime_flags)) {
3661                         if (inode_only == LOG_INODE_ALL)
3662                                 fast_search = true;
3663                         max_key.type = BTRFS_XATTR_ITEM_KEY;
3664                         ret = drop_objectid_items(trans, log, path, ino,
3665                                                   max_key.type);
3666                 } else {
3667                         if (inode_only == LOG_INODE_ALL)
3668                                 fast_search = true;
3669                         ret = log_inode_item(trans, log, dst_path, inode);
3670                         if (ret) {
3671                                 err = ret;
3672                                 goto out_unlock;
3673                         }
3674                         goto log_extents;
3675                 }
3676
3677         }
3678         if (ret) {
3679                 err = ret;
3680                 goto out_unlock;
3681         }
3682         path->keep_locks = 1;
3683
3684         while (1) {
3685                 ins_nr = 0;
3686                 ret = btrfs_search_forward(root, &min_key, &max_key,
3687                                            path, trans->transid);
3688                 if (ret != 0)
3689                         break;
3690 again:
3691                 /* note, ins_nr might be > 0 here, cleanup outside the loop */
3692                 if (min_key.objectid != ino)
3693                         break;
3694                 if (min_key.type > max_key.type)
3695                         break;
3696
3697                 src = path->nodes[0];
3698                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
3699                         ins_nr++;
3700                         goto next_slot;
3701                 } else if (!ins_nr) {
3702                         ins_start_slot = path->slots[0];
3703                         ins_nr = 1;
3704                         goto next_slot;
3705                 }
3706
3707                 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
3708                                  ins_nr, inode_only);
3709                 if (ret) {
3710                         err = ret;
3711                         goto out_unlock;
3712                 }
3713                 ins_nr = 1;
3714                 ins_start_slot = path->slots[0];
3715 next_slot:
3716
3717                 nritems = btrfs_header_nritems(path->nodes[0]);
3718                 path->slots[0]++;
3719                 if (path->slots[0] < nritems) {
3720                         btrfs_item_key_to_cpu(path->nodes[0], &min_key,
3721                                               path->slots[0]);
3722                         goto again;
3723                 }
3724                 if (ins_nr) {
3725                         ret = copy_items(trans, inode, dst_path, src,
3726                                          ins_start_slot,
3727                                          ins_nr, inode_only);
3728                         if (ret) {
3729                                 err = ret;
3730                                 goto out_unlock;
3731                         }
3732                         ins_nr = 0;
3733                 }
3734                 btrfs_release_path(path);
3735
3736                 if (min_key.offset < (u64)-1)
3737                         min_key.offset++;
3738                 else if (min_key.type < (u8)-1)
3739                         min_key.type++;
3740                 else if (min_key.objectid < (u64)-1)
3741                         min_key.objectid++;
3742                 else
3743                         break;
3744         }
3745         if (ins_nr) {
3746                 ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
3747                                  ins_nr, inode_only);
3748                 if (ret) {
3749                         err = ret;
3750                         goto out_unlock;
3751                 }
3752                 ins_nr = 0;
3753         }
3754
3755 log_extents:
3756         btrfs_release_path(path);
3757         btrfs_release_path(dst_path);
3758         if (fast_search) {
3759                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3760                 if (ret) {
3761                         err = ret;
3762                         goto out_unlock;
3763                 }
3764         } else {
3765                 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3766                 struct extent_map *em, *n;
3767
3768                 write_lock(&tree->lock);
3769                 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3770                         list_del_init(&em->list);
3771                 write_unlock(&tree->lock);
3772         }
3773
3774         if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
3775                 ret = log_directory_changes(trans, root, inode, path, dst_path);
3776                 if (ret) {
3777                         err = ret;
3778                         goto out_unlock;
3779                 }
3780         }
3781         BTRFS_I(inode)->logged_trans = trans->transid;
3782         BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
3783 out_unlock:
3784         if (err)
3785                 btrfs_free_logged_extents(log, log->log_transid);
3786         mutex_unlock(&BTRFS_I(inode)->log_mutex);
3787
3788         btrfs_free_path(path);
3789         btrfs_free_path(dst_path);
3790         return err;
3791 }
3792
3793 /*
3794  * follow the dentry parent pointers up the chain and see if any
3795  * of the directories in it require a full commit before they can
3796  * be logged.  Returns zero if nothing special needs to be done or 1 if
3797  * a full commit is required.
3798  */
3799 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
3800                                                struct inode *inode,
3801                                                struct dentry *parent,
3802                                                struct super_block *sb,
3803                                                u64 last_committed)
3804 {
3805         int ret = 0;
3806         struct btrfs_root *root;
3807         struct dentry *old_parent = NULL;
3808
3809         /*
3810          * for regular files, if its inode is already on disk, we don't
3811          * have to worry about the parents at all.  This is because
3812          * we can use the last_unlink_trans field to record renames
3813          * and other fun in this file.
3814          */
3815         if (S_ISREG(inode->i_mode) &&
3816             BTRFS_I(inode)->generation <= last_committed &&
3817             BTRFS_I(inode)->last_unlink_trans <= last_committed)
3818                         goto out;
3819
3820         if (!S_ISDIR(inode->i_mode)) {
3821                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3822                         goto out;
3823                 inode = parent->d_inode;
3824         }
3825
3826         while (1) {
3827                 BTRFS_I(inode)->logged_trans = trans->transid;
3828                 smp_mb();
3829
3830                 if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
3831                         root = BTRFS_I(inode)->root;
3832
3833                         /*
3834                          * make sure any commits to the log are forced
3835                          * to be full commits
3836                          */
3837                         root->fs_info->last_trans_log_full_commit =
3838                                 trans->transid;
3839                         ret = 1;
3840                         break;
3841                 }
3842
3843                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3844                         break;
3845
3846                 if (IS_ROOT(parent))
3847                         break;
3848
3849                 parent = dget_parent(parent);
3850                 dput(old_parent);
3851                 old_parent = parent;
3852                 inode = parent->d_inode;
3853
3854         }
3855         dput(old_parent);
3856 out:
3857         return ret;
3858 }
3859
3860 /*
3861  * helper function around btrfs_log_inode to make sure newly created
3862  * parent directories also end up in the log.  A minimal inode and backref
3863  * only logging is done of any parent directories that are older than
3864  * the last committed transaction
3865  */
3866 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3867                                   struct btrfs_root *root, struct inode *inode,
3868                                   struct dentry *parent, int exists_only)
3869 {
3870         int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
3871         struct super_block *sb;
3872         struct dentry *old_parent = NULL;
3873         int ret = 0;
3874         u64 last_committed = root->fs_info->last_trans_committed;
3875
3876         sb = inode->i_sb;
3877
3878         if (btrfs_test_opt(root, NOTREELOG)) {
3879                 ret = 1;
3880                 goto end_no_trans;
3881         }
3882
3883         if (root->fs_info->last_trans_log_full_commit >
3884             root->fs_info->last_trans_committed) {
3885                 ret = 1;
3886                 goto end_no_trans;
3887         }
3888
3889         if (root != BTRFS_I(inode)->root ||
3890             btrfs_root_refs(&root->root_item) == 0) {
3891                 ret = 1;
3892                 goto end_no_trans;
3893         }
3894
3895         ret = check_parent_dirs_for_sync(trans, inode, parent,
3896                                          sb, last_committed);
3897         if (ret)
3898                 goto end_no_trans;
3899
3900         if (btrfs_inode_in_log(inode, trans->transid)) {
3901                 ret = BTRFS_NO_LOG_SYNC;
3902                 goto end_no_trans;
3903         }
3904
3905         ret = start_log_trans(trans, root);
3906         if (ret)
3907                 goto end_trans;
3908
3909         ret = btrfs_log_inode(trans, root, inode, inode_only);
3910         if (ret)
3911                 goto end_trans;
3912
3913         /*
3914          * for regular files, if its inode is already on disk, we don't
3915          * have to worry about the parents at all.  This is because
3916          * we can use the last_unlink_trans field to record renames
3917          * and other fun in this file.
3918          */
3919         if (S_ISREG(inode->i_mode) &&
3920             BTRFS_I(inode)->generation <= last_committed &&
3921             BTRFS_I(inode)->last_unlink_trans <= last_committed) {
3922                 ret = 0;
3923                 goto end_trans;
3924         }
3925
3926         inode_only = LOG_INODE_EXISTS;
3927         while (1) {
3928                 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
3929                         break;
3930
3931                 inode = parent->d_inode;
3932                 if (root != BTRFS_I(inode)->root)
3933                         break;
3934
3935                 if (BTRFS_I(inode)->generation >
3936                     root->fs_info->last_trans_committed) {
3937                         ret = btrfs_log_inode(trans, root, inode, inode_only);
3938                         if (ret)
3939                                 goto end_trans;
3940                 }
3941                 if (IS_ROOT(parent))
3942                         break;
3943
3944                 parent = dget_parent(parent);
3945                 dput(old_parent);
3946                 old_parent = parent;
3947         }
3948         ret = 0;
3949 end_trans:
3950         dput(old_parent);
3951         if (ret < 0) {
3952                 root->fs_info->last_trans_log_full_commit = trans->transid;
3953                 ret = 1;
3954         }
3955         btrfs_end_log_trans(root);
3956 end_no_trans:
3957         return ret;
3958 }
3959
3960 /*
3961  * it is not safe to log dentry if the chunk root has added new
3962  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
3963  * If this returns 1, you must commit the transaction to safely get your
3964  * data on disk.
3965  */
3966 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3967                           struct btrfs_root *root, struct dentry *dentry)
3968 {
3969         struct dentry *parent = dget_parent(dentry);
3970         int ret;
3971
3972         ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3973         dput(parent);
3974
3975         return ret;
3976 }
3977
3978 /*
3979  * should be called during mount to recover any replay any log trees
3980  * from the FS
3981  */
3982 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3983 {
3984         int ret;
3985         struct btrfs_path *path;
3986         struct btrfs_trans_handle *trans;
3987         struct btrfs_key key;
3988         struct btrfs_key found_key;
3989         struct btrfs_key tmp_key;
3990         struct btrfs_root *log;
3991         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
3992         struct walk_control wc = {
3993                 .process_func = process_one_buffer,
3994                 .stage = 0,
3995         };
3996
3997         path = btrfs_alloc_path();
3998         if (!path)
3999                 return -ENOMEM;
4000
4001         fs_info->log_root_recovering = 1;
4002
4003         trans = btrfs_start_transaction(fs_info->tree_root, 0);
4004         if (IS_ERR(trans)) {
4005                 ret = PTR_ERR(trans);
4006                 goto error;
4007         }
4008
4009         wc.trans = trans;
4010         wc.pin = 1;
4011
4012         ret = walk_log_tree(trans, log_root_tree, &wc);
4013         if (ret) {
4014                 btrfs_error(fs_info, ret, "Failed to pin buffers while "
4015                             "recovering log root tree.");
4016                 goto error;
4017         }
4018
4019 again:
4020         key.objectid = BTRFS_TREE_LOG_OBJECTID;
4021         key.offset = (u64)-1;
4022         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
4023
4024         while (1) {
4025                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
4026
4027                 if (ret < 0) {
4028                         btrfs_error(fs_info, ret,
4029                                     "Couldn't find tree log root.");
4030                         goto error;
4031                 }
4032                 if (ret > 0) {
4033                         if (path->slots[0] == 0)
4034                                 break;
4035                         path->slots[0]--;
4036                 }
4037                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4038                                       path->slots[0]);
4039                 btrfs_release_path(path);
4040                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4041                         break;
4042
4043                 log = btrfs_read_fs_root(log_root_tree, &found_key);
4044                 if (IS_ERR(log)) {
4045                         ret = PTR_ERR(log);
4046                         btrfs_error(fs_info, ret,
4047                                     "Couldn't read tree log root.");
4048                         goto error;
4049                 }
4050
4051                 tmp_key.objectid = found_key.offset;
4052                 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
4053                 tmp_key.offset = (u64)-1;
4054
4055                 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
4056                 if (IS_ERR(wc.replay_dest)) {
4057                         ret = PTR_ERR(wc.replay_dest);
4058                         free_extent_buffer(log->node);
4059                         free_extent_buffer(log->commit_root);
4060                         kfree(log);
4061                         btrfs_error(fs_info, ret, "Couldn't read target root "
4062                                     "for tree log recovery.");
4063                         goto error;
4064                 }
4065
4066                 wc.replay_dest->log_root = log;
4067                 btrfs_record_root_in_trans(trans, wc.replay_dest);
4068                 ret = walk_log_tree(trans, log, &wc);
4069
4070                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
4071                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
4072                                                       path);
4073                 }
4074
4075                 key.offset = found_key.offset - 1;
4076                 wc.replay_dest->log_root = NULL;
4077                 free_extent_buffer(log->node);
4078                 free_extent_buffer(log->commit_root);
4079                 kfree(log);
4080
4081                 if (ret)
4082                         goto error;
4083
4084                 if (found_key.offset == 0)
4085                         break;
4086         }
4087         btrfs_release_path(path);
4088
4089         /* step one is to pin it all, step two is to replay just inodes */
4090         if (wc.pin) {
4091                 wc.pin = 0;
4092                 wc.process_func = replay_one_buffer;
4093                 wc.stage = LOG_WALK_REPLAY_INODES;
4094                 goto again;
4095         }
4096         /* step three is to replay everything */
4097         if (wc.stage < LOG_WALK_REPLAY_ALL) {
4098                 wc.stage++;
4099                 goto again;
4100         }
4101
4102         btrfs_free_path(path);
4103
4104         /* step 4: commit the transaction, which also unpins the blocks */
4105         ret = btrfs_commit_transaction(trans, fs_info->tree_root);
4106         if (ret)
4107                 return ret;
4108
4109         free_extent_buffer(log_root_tree->node);
4110         log_root_tree->log_root = NULL;
4111         fs_info->log_root_recovering = 0;
4112         kfree(log_root_tree);
4113
4114         return 0;
4115 error:
4116         if (wc.trans)
4117                 btrfs_end_transaction(wc.trans, fs_info->tree_root);
4118         btrfs_free_path(path);
4119         return ret;
4120 }
4121
4122 /*
4123  * there are some corner cases where we want to force a full
4124  * commit instead of allowing a directory to be logged.
4125  *
4126  * They revolve around files there were unlinked from the directory, and
4127  * this function updates the parent directory so that a full commit is
4128  * properly done if it is fsync'd later after the unlinks are done.
4129  */
4130 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
4131                              struct inode *dir, struct inode *inode,
4132                              int for_rename)
4133 {
4134         /*
4135          * when we're logging a file, if it hasn't been renamed
4136          * or unlinked, and its inode is fully committed on disk,
4137          * we don't have to worry about walking up the directory chain
4138          * to log its parents.
4139          *
4140          * So, we use the last_unlink_trans field to put this transid
4141          * into the file.  When the file is logged we check it and
4142          * don't log the parents if the file is fully on disk.
4143          */
4144         if (S_ISREG(inode->i_mode))
4145                 BTRFS_I(inode)->last_unlink_trans = trans->transid;
4146
4147         /*
4148          * if this directory was already logged any new
4149          * names for this file/dir will get recorded
4150          */
4151         smp_mb();
4152         if (BTRFS_I(dir)->logged_trans == trans->transid)
4153                 return;
4154
4155         /*
4156          * if the inode we're about to unlink was logged,
4157          * the log will be properly updated for any new names
4158          */
4159         if (BTRFS_I(inode)->logged_trans == trans->transid)
4160                 return;
4161
4162         /*
4163          * when renaming files across directories, if the directory
4164          * there we're unlinking from gets fsync'd later on, there's
4165          * no way to find the destination directory later and fsync it
4166          * properly.  So, we have to be conservative and force commits
4167          * so the new name gets discovered.
4168          */
4169         if (for_rename)
4170                 goto record;
4171
4172         /* we can safely do the unlink without any special recording */
4173         return;
4174
4175 record:
4176         BTRFS_I(dir)->last_unlink_trans = trans->transid;
4177 }
4178
4179 /*
4180  * Call this after adding a new name for a file and it will properly
4181  * update the log to reflect the new name.
4182  *
4183  * It will return zero if all goes well, and it will return 1 if a
4184  * full transaction commit is required.
4185  */
4186 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4187                         struct inode *inode, struct inode *old_dir,
4188                         struct dentry *parent)
4189 {
4190         struct btrfs_root * root = BTRFS_I(inode)->root;
4191
4192         /*
4193          * this will force the logging code to walk the dentry chain
4194          * up for the file
4195          */
4196         if (S_ISREG(inode->i_mode))
4197                 BTRFS_I(inode)->last_unlink_trans = trans->transid;
4198
4199         /*
4200          * if this inode hasn't been logged and directory we're renaming it
4201          * from hasn't been logged, we don't need to log it
4202          */
4203         if (BTRFS_I(inode)->logged_trans <=
4204             root->fs_info->last_trans_committed &&
4205             (!old_dir || BTRFS_I(old_dir)->logged_trans <=
4206                     root->fs_info->last_trans_committed))
4207                 return 0;
4208
4209         return btrfs_log_inode_parent(trans, root, inode, parent, 1);
4210 }
4211