git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Btrfs: reduce CPU contention while waiting for delayed extent operations
author Chris Mason <chris.mason@fusionio.com>
Tue, 29 Jan 2013 23:44:12 +0000 (18:44 -0500)
committer Chris Mason <chris.mason@fusionio.com>
Fri, 1 Feb 2013 19:24:25 +0000 (14:24 -0500)
We batch up operations to the extent allocation tree, which allows
us to deal with the recursive nature of using the extent allocation
tree to allocate extents to the extent allocation tree.

It also provides a mechanism to sort and collect extent
operations, which makes it much more efficient to record extents
that are close together.

The delayed extent operations must all be finished before the
running transaction commits, so we have code that makes sure to run a few
of the batched operations when closing our transaction handles.

This creates a great deal of contention for the locks in the
delayed extent operation tree, and also contention for the lock on the
extent allocation tree itself.  All the extra contention just slows
down the operations and doesn't get things done any faster.

This commit changes things to use a wait queue instead.  As procs
want to run the delayed operations, one of them races in and gets
permission to hit the tree, and the others step back and wait for
progress to be made.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
fs/btrfs/delayed-ref.h
fs/btrfs/extent-tree.c
fs/btrfs/transaction.c

index c9d703693df0b91e4c01a9181f93d2ab64572dcf..23bdeb8502a6c7b215419e2f7de83a0258519663 100644 (file)
@@ -131,6 +131,15 @@ struct btrfs_delayed_ref_root {
        /* total number of head nodes ready for processing */
        unsigned long num_heads_ready;
 
+       /*
+        * bumped when someone is making progress on the delayed
+        * refs, so that other procs know they are just adding to
+        * contention instead of helping
+        */
+       atomic_t procs_running_refs;
+       atomic_t ref_seq;
+       wait_queue_head_t wait;
+
        /*
         * set when the tree is flushing before a transaction commit,
         * used by the throttling code to decide if new updates need
index f13402104c96ed268dd4d0c748e55c1ec0df12f1..87b0e856b6d04f88834ade99fe6766a78cc002e8 100644 (file)
@@ -2438,6 +2438,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+                     int count)
+{
+       int val = atomic_read(&delayed_refs->ref_seq);
+
+       if (val < seq || val >= seq + count)
+               return 1;
+       return 0;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2472,6 +2482,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
+       if (count == 0) {
+               count = delayed_refs->num_entries * 2;
+               run_most = 1;
+       }
+
+       if (!run_all && !run_most) {
+               int old;
+               int seq = atomic_read(&delayed_refs->ref_seq);
+
+progress:
+               old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+               if (old) {
+                       DEFINE_WAIT(__wait);
+                       if (delayed_refs->num_entries < 16348)
+                               return 0;
+
+                       prepare_to_wait(&delayed_refs->wait, &__wait,
+                                       TASK_UNINTERRUPTIBLE);
+
+                       old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+                       if (old) {
+                               schedule();
+                               finish_wait(&delayed_refs->wait, &__wait);
+
+                               if (!refs_newer(delayed_refs, seq, 256))
+                                       goto progress;
+                               else
+                                       return 0;
+                       } else {
+                               finish_wait(&delayed_refs->wait, &__wait);
+                               goto again;
+                       }
+               }
+
+       } else {
+               atomic_inc(&delayed_refs->procs_running_refs);
+       }
+
 again:
        loops = 0;
        spin_lock(&delayed_refs->lock);
@@ -2480,10 +2528,6 @@ again:
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
 
-       if (count == 0) {
-               count = delayed_refs->num_entries * 2;
-               run_most = 1;
-       }
        while (1) {
                if (!(run_all || run_most) &&
                    delayed_refs->num_heads_ready < 64)
@@ -2505,9 +2549,12 @@ again:
                if (ret < 0) {
                        spin_unlock(&delayed_refs->lock);
                        btrfs_abort_transaction(trans, root, ret);
+                       atomic_dec(&delayed_refs->procs_running_refs);
                        return ret;
                }
 
+               atomic_add(ret, &delayed_refs->ref_seq);
+
                count -= min_t(unsigned long, ret, count);
 
                if (count == 0)
@@ -2576,6 +2623,11 @@ again:
                goto again;
        }
 out:
+       atomic_dec(&delayed_refs->procs_running_refs);
+       smp_mb();
+       if (waitqueue_active(&delayed_refs->wait))
+               wake_up(&delayed_refs->wait);
+
        spin_unlock(&delayed_refs->lock);
        assert_qgroups_uptodate(trans);
        return 0;
index a065dec0e33099cb71cc80c66345e05769867976..1e7f176bd21106a98f0783da1295839a421eeee8 100644 (file)
@@ -156,6 +156,9 @@ loop:
 
        spin_lock_init(&cur_trans->commit_lock);
        spin_lock_init(&cur_trans->delayed_refs.lock);
+       atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
+       atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
+       init_waitqueue_head(&cur_trans->delayed_refs.wait);
 
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -577,7 +580,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
 
-       while (count < 2) {
+       while (count < 1) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
                if (cur &&
@@ -589,6 +592,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                }
                count++;
        }
+
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;