Merge tag 'drm-intel-next-fixes-2015-11-06' of git://anongit.freedesktop.org/drm...

[karo-tx-linux.git] / fs / btrfs / tree-log.c
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index 9c45431e69aba3a7a8e2d0bcfb8c3b4a18182df2..1bbaace733838e6fc70dba1babafb6f9265c00f9 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct btrfs_log_ctx *ctx)
  {
-       int index;
-       int ret;
+       int ret = 0;
  
         mutex_lock(&root->log_mutex);
+
         if (root->log_root) {
                 if (btrfs_need_log_full_commit(root->fs_info, trans)) {
                         ret = -EAGAIN;
                         goto out;
                 }
+
                 if (!root->log_start_pid) {
-                       root->log_start_pid = current->pid;
                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+                       root->log_start_pid = current->pid;
                 } else if (root->log_start_pid != current->pid) {
                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                 }
+       } else {
+               mutex_lock(&root->fs_info->tree_log_mutex);
+               if (!root->fs_info->log_root_tree)
+                       ret = btrfs_init_log_root_tree(trans, root->fs_info);
+               mutex_unlock(&root->fs_info->tree_log_mutex);
+               if (ret)
+                       goto out;
  
-               atomic_inc(&root->log_batch);
-               atomic_inc(&root->log_writers);
-               if (ctx) {
-                       index = root->log_transid % 2;
-                       list_add_tail(&ctx->list, &root->log_ctxs[index]);
-                       ctx->log_transid = root->log_transid;
-               }
-               mutex_unlock(&root->log_mutex);
-               return 0;
-       }
-
-       ret = 0;
-       mutex_lock(&root->fs_info->tree_log_mutex);
-       if (!root->fs_info->log_root_tree)
-               ret = btrfs_init_log_root_tree(trans, root->fs_info);
-       mutex_unlock(&root->fs_info->tree_log_mutex);
-       if (ret)
-               goto out;
-
-       if (!root->log_root) {
                 ret = btrfs_add_log_tree(trans, root);
                 if (ret)
                         goto out;
+
+               clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+               root->log_start_pid = current->pid;
         }
-       clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
-       root->log_start_pid = current->pid;
+
         atomic_inc(&root->log_batch);
         atomic_inc(&root->log_writers);
         if (ctx) {
-               index = root->log_transid % 2;
+               int index = root->log_transid % 2;
                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
                 ctx->log_transid = root->log_transid;
         }
+
  out:
         mutex_unlock(&root->log_mutex);
         return ret;
@@ -731,11 +722,65 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                                 &ordered_sums, 0);
                         if (ret)
                                 goto out;
+                       /*
+                        * Now delete all existing cums in the csum root that
+                        * cover our range. We do this because we can have an
+                        * extent that is completely referenced by one file
+                        * extent item and partially referenced by another
+                        * file extent item (like after using the clone or
+                        * extent_same ioctls). In this case if we end up doing
+                        * the replay of the one that partially references the
+                        * extent first, and we do not do the csum deletion
+                        * below, we can get 2 csum items in the csum tree that
+                        * overlap each other. For example, imagine our log has
+                        * the two following file extent items:
+                        *
+                        * key (257 EXTENT_DATA 409600)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 20480 nr 20480 ram 102400
+                        *
+                        * key (257 EXTENT_DATA 819200)
+                        *     extent data disk byte 12845056 nr 102400
+                        *     extent data offset 0 nr 102400 ram 102400
+                        *
+                        * Where the second one fully references the 100K extent
+                        * that starts at disk byte 12845056, and the log tree
+                        * has a single csum item that covers the entire range
+                        * of the extent:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        *
+                        * After the first file extent item is replayed, the
+                        * csum tree gets the following csum item:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which covers the 20K sub-range starting at offset 20K
+                        * of our extent. Now when we replay the second file
+                        * extent item, if we do not delete existing csum items
+                        * that cover any of its blocks, we end up getting two
+                        * csum items in our csum tree that overlap each other:
+                        *
+                        * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+                        * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+                        *
+                        * Which is a problem, because after this anyone trying
+                        * to lookup up for the checksum of any block of our
+                        * extent starting at an offset of 40K or higher, will
+                        * end up looking at the second csum item only, which
+                        * does not contain the checksum for any block starting
+                        * at offset 40K or higher of our extent.
+                        */
                         while (!list_empty(&ordered_sums)) {
                                 struct btrfs_ordered_sum *sums;
                                 sums = list_entry(ordered_sums.next,
                                                 struct btrfs_ordered_sum,
                                                 list);
+                               if (!ret)
+                                       ret = btrfs_del_csums(trans,
+                                                     root->fs_info->csum_root,
+                                                     sums->bytenr,
+                                                     sums->len);
                                 if (!ret)
                                         ret = btrfs_csum_file_blocks(trans,
                                                 root->fs_info->csum_root,
@@ -1549,9 +1594,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
   */
  static noinline int insert_one_name(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
-                                   struct btrfs_path *path,
                                     u64 dirid, u64 index,
-                                   char *name, int name_len, u8 type,
+                                   char *name, int name_len,
                                     struct btrfs_key *location)
  {
         struct inode *inode;
@@ -1613,6 +1657,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root,
   * not exist in the FS, it is skipped.  fsyncs on directories
   * do not force down inodes inside that directory, just changes to the
   * names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
   */
  static noinline int replay_one_name(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
@@ -1631,6 +1678,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
         int exists;
         int ret = 0;
         bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+       bool name_added = false;
  
         dir = read_one_inode(root, key->objectid);
         if (!dir)
@@ -1708,6 +1756,8 @@ out:
         }
         kfree(name);
         iput(dir);
+       if (!ret && name_added)
+               ret = 1;
         return ret;
  
  insert:
@@ -1719,10 +1769,12 @@ insert:
                 goto out;
         }
         btrfs_release_path(path);
-       ret = insert_one_name(trans, root, path, key->objectid, key->offset,
-                             name, name_len, log_type, &log_key);
+       ret = insert_one_name(trans, root, key->objectid, key->offset,
+                             name, name_len, &log_key);
         if (ret && ret != -ENOENT && ret != -EEXIST)
                 goto out;
+       if (!ret)
+               name_added = true;
         update_size = false;
         ret = 0;
         goto out;
@@ -1740,12 +1792,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                                         struct extent_buffer *eb, int slot,
                                         struct btrfs_key *key)
  {
-       int ret;
+       int ret = 0;
         u32 item_size = btrfs_item_size_nr(eb, slot);
         struct btrfs_dir_item *di;
         int name_len;
         unsigned long ptr;
         unsigned long ptr_end;
+       struct btrfs_path *fixup_path = NULL;
  
         ptr = btrfs_item_ptr_offset(eb, slot);
         ptr_end = ptr + item_size;
@@ -1755,12 +1808,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                         return -EIO;
                 name_len = btrfs_dir_name_len(eb, di);
                 ret = replay_one_name(trans, root, path, eb, di, key);
-               if (ret)
-                       return ret;
+               if (ret < 0)
+                       break;
                 ptr = (unsigned long)(di + 1);
                 ptr += name_len;
+
+               /*
+                * If this entry refers to a non-directory (directories can not
+                * have a link count > 1) and it was added in the transaction
+                * that was not committed, make sure we fixup the link count of
+                * the inode it the entry points to. Otherwise something like
+                * the following would result in a directory pointing to an
+                * inode with a wrong link that does not account for this dir
+                * entry:
+                *
+                * mkdir testdir
+                * touch testdir/foo
+                * touch testdir/bar
+                * sync
+                *
+                * ln testdir/bar testdir/bar_link
+                * ln testdir/foo testdir/foo_link
+                * xfs_io -c "fsync" testdir/bar
+                *
+                * <power failure>
+                *
+                * mount fs, log replay happens
+                *
+                * File foo would remain with a link count of 1 when it has two
+                * entries pointing to it in the directory testdir. This would
+                * make it impossible to ever delete the parent directory has
+                * it would result in stale dentries that can never be deleted.
+                */
+               if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+                       struct btrfs_key di_key;
+
+                       if (!fixup_path) {
+                               fixup_path = btrfs_alloc_path();
+                               if (!fixup_path) {
+                                       ret = -ENOMEM;
+                                       break;
+                               }
+                       }
+
+                       btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+                       ret = link_to_fixup_dir(trans, root, fixup_path,
+                                               di_key.objectid);
+                       if (ret)
+                               break;
+               }
+               ret = 0;
         }
-       return 0;
+       btrfs_free_path(fixup_path);
+       return ret;
  }
  
  /*
@@ -2535,8 +2635,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
         return ret;
  }
  
-static void wait_log_commit(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root, int transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
  {
         DEFINE_WAIT(wait);
         int index = transid % 2;
@@ -2561,8 +2660,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans,
                  atomic_read(&root->log_commit[index]));
  }
  
-static void wait_for_writer(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
  {
         DEFINE_WAIT(wait);
  
@@ -2642,7 +2740,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
  
         index1 = log_transid % 2;
         if (atomic_read(&root->log_commit[index1])) {
-               wait_log_commit(trans, root, log_transid);
+               wait_log_commit(root, log_transid);
                 mutex_unlock(&root->log_mutex);
                 return ctx->log_ret;
         }
@@ -2651,7 +2749,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
  
         /* wait for previous tree log sync to complete */
         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-               wait_log_commit(trans, root, log_transid - 1);
+               wait_log_commit(root, log_transid - 1);
  
         while (1) {
                 int batch = atomic_read(&root->log_batch);
@@ -2662,7 +2760,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                         schedule_timeout_uninterruptible(1);
                         mutex_lock(&root->log_mutex);
                 }
-               wait_for_writer(trans, root);
+               wait_for_writer(root);
                 if (batch == atomic_read(&root->log_batch))
                         break;
         }
@@ -2759,7 +2857,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
                                                 mark);
                 btrfs_wait_logged_extents(trans, log, log_transid);
-               wait_log_commit(trans, log_root_tree,
+               wait_log_commit(log_root_tree,
                                 root_log_ctx.log_transid);
                 mutex_unlock(&log_root_tree->log_mutex);
                 if (!ret)
@@ -2770,11 +2868,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         atomic_set(&log_root_tree->log_commit[index2], 1);
  
         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
-               wait_log_commit(trans, log_root_tree,
+               wait_log_commit(log_root_tree,
                                 root_log_ctx.log_transid - 1);
         }
  
-       wait_for_writer(trans, log_root_tree);
+       wait_for_writer(log_root_tree);
  
         /*
          * now that we've moved on to the tree of log tree roots,
@@ -4904,6 +5002,94 @@ next_dir_inode:
         return ret;
  }
  
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+                                struct inode *inode,
+                                struct btrfs_log_ctx *ctx)
+{
+       int ret;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       const u64 ino = btrfs_ino(inode);
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       path->skip_locking = 1;
+       path->search_commit_root = 1;
+
+       key.objectid = ino;
+       key.type = BTRFS_INODE_REF_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+
+       while (true) {
+               struct extent_buffer *leaf = path->nodes[0];
+               int slot = path->slots[0];
+               u32 cur_offset = 0;
+               u32 item_size;
+               unsigned long ptr;
+
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+                       continue;
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+               if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+                       break;
+
+               item_size = btrfs_item_size_nr(leaf, slot);
+               ptr = btrfs_item_ptr_offset(leaf, slot);
+               while (cur_offset < item_size) {
+                       struct btrfs_key inode_key;
+                       struct inode *dir_inode;
+
+                       inode_key.type = BTRFS_INODE_ITEM_KEY;
+                       inode_key.offset = 0;
+
+                       if (key.type == BTRFS_INODE_EXTREF_KEY) {
+                               struct btrfs_inode_extref *extref;
+
+                               extref = (struct btrfs_inode_extref *)
+                                       (ptr + cur_offset);
+                               inode_key.objectid = btrfs_inode_extref_parent(
+                                       leaf, extref);
+                               cur_offset += sizeof(*extref);
+                               cur_offset += btrfs_inode_extref_name_len(leaf,
+                                       extref);
+                       } else {
+                               inode_key.objectid = key.offset;
+                               cur_offset = item_size;
+                       }
+
+                       dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+                                              root, NULL);
+                       /* If parent inode was deleted, skip it. */
+                       if (IS_ERR(dir_inode))
+                               continue;
+
+                       ret = btrfs_log_inode(trans, root, dir_inode,
+                                             LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+                       iput(dir_inode);
+                       if (ret)
+                               goto out;
+               }
+               path->slots[0]++;
+       }
+       ret = 0;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
  /*
   * helper function around btrfs_log_inode to make sure newly created
   * parent directories also end up in the log.  A minimal inode and backref
@@ -4923,9 +5109,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
         struct dentry *old_parent = NULL;
         int ret = 0;
         u64 last_committed = root->fs_info->last_trans_committed;
-       const struct dentry * const first_parent = parent;
-       const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
-                                last_committed);
         bool log_dentries = false;
         struct inode *orig_inode = inode;
  
@@ -4986,6 +5169,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
         if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
                 log_dentries = true;
  
+       /*
+        * On unlink we must make sure all our current and old parent directores
+        * inodes are fully logged. This is to prevent leaving dangling
+        * directory index entries in directories that were our parents but are
+        * not anymore. Not doing this results in old parent directory being
+        * impossible to delete after log replay (rmdir will always fail with
+        * error -ENOTEMPTY).
+        *
+        * Example 1:
+        *
+        * mkdir testdir
+        * touch testdir/foo
+        * ln testdir/foo testdir/bar
+        * sync
+        * unlink testdir/bar
+        * xfs_io -c fsync testdir/foo
+        * <power failure>
+        * mount fs, triggers log replay
+        *
+        * If we don't log the parent directory (testdir), after log replay the
+        * directory still has an entry pointing to the file inode using the bar
+        * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+        * the file inode has a link count of 1.
+        *
+        * Example 2:
+        *
+        * mkdir testdir
+        * touch foo
+        * ln foo testdir/foo2
+        * ln foo testdir/foo3
+        * sync
+        * unlink testdir/foo3
+        * xfs_io -c fsync foo
+        * <power failure>
+        * mount fs, triggers log replay
+        *
+        * Similar as the first example, after log replay the parent directory
+        * testdir still has an entry pointing to the inode file with name foo3
+        * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
+        * and has a link count of 2.
+        */
+       if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+               ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+               if (ret)
+                       goto end_trans;
+       }
+
         while (1) {
                 if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
                         break;
@@ -4994,23 +5224,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                 if (root != BTRFS_I(inode)->root)
                         break;
  
-               /*
-                * On unlink we must make sure our immediate parent directory
-                * inode is fully logged. This is to prevent leaving dangling
-                * directory index entries and a wrong directory inode's i_size.
-                * Not doing so can result in a directory being impossible to
-                * delete after log replay (rmdir will always fail with error
-                * -ENOTEMPTY).
-                */
-               if (did_unlink && parent == first_parent)
-                       inode_only = LOG_INODE_ALL;
-               else
-                       inode_only = LOG_INODE_EXISTS;
-
-               if (BTRFS_I(inode)->generation >
-                   root->fs_info->last_trans_committed ||
-                   inode_only == LOG_INODE_ALL) {
-                       ret = btrfs_log_inode(trans, root, inode, inode_only,
+               if (BTRFS_I(inode)->generation > last_committed) {
+                       ret = btrfs_log_inode(trans, root, inode,
+                                             LOG_INODE_EXISTS,
                                               0, LLONG_MAX, ctx);
                         if (ret)
                                 goto end_trans;