From: Chris Mason Date: Tue, 29 Jan 2013 23:44:12 +0000 (-0500) Subject: Btrfs: reduce CPU contention while waiting for delayed extent operations X-Git-Tag: v3.9-rc1~17^2~28^2~3 X-Git-Url: https://git.karo-electronics.de/?a=commitdiff_plain;h=bb721703aa551e98dc5c7fb259cf90343408baf2;p=karo-tx-linux.git Btrfs: reduce CPU contention while waiting for delayed extent operations We batch up operations to the extent allocation tree, which allows us to deal with the recursive nature of using the extent allocation tree to allocate extents to the extent allocation tree. It also provides a mechanism to sort and collect extent operations, which makes it much more efficient to record extents that are close together. The delayed extent operations must all be finished before the running transaction commits, so we have code to make sure and run a few of the batched operations when closing our transaction handles. This creates a great deal of contention for the locks in the delayed extent operation tree, and also contention for the lock on the extent allocation tree itself. All the extra contention just slows down the operations and doesn't get things done any faster. This commit changes things to use a wait queue instead. As procs want to run the delayed operations, one of them races in and gets permission to hit the tree, and the others step back and wait for progress to be made. Signed-off-by: Chris Mason --- diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d703693df0..23bdeb8502a6 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -131,6 +131,15 @@ struct btrfs_delayed_ref_root { /* total number of head nodes ready for processing */ unsigned long num_heads_ready; + /* + * bumped when someone is making progress on the delayed + * refs, so that other procs know they are just adding to + * contention intead of helping + */ + atomic_t procs_running_refs; + atomic_t ref_seq; + wait_queue_head_t wait; + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f13402104c96..87b0e856b6d0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2438,6 +2438,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, return ret; } +static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, + int count) +{ + int val = atomic_read(&delayed_refs->ref_seq); + + if (val < seq || val >= seq + count) + return 1; + return 0; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2472,6 +2482,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + + if (!run_all && !run_most) { + int old; + int seq = atomic_read(&delayed_refs->ref_seq); + +progress: + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + DEFINE_WAIT(__wait); + if (delayed_refs->num_entries < 16348) + return 0; + + prepare_to_wait(&delayed_refs->wait, &__wait, + TASK_UNINTERRUPTIBLE); + + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + schedule(); + finish_wait(&delayed_refs->wait, &__wait); + + if (!refs_newer(delayed_refs, seq, 256)) + goto progress; + else + return 0; + } else { + finish_wait(&delayed_refs->wait, &__wait); + goto again; + } + } + + } else { + atomic_inc(&delayed_refs->procs_running_refs); + } + again: loops = 0; spin_lock(&delayed_refs->lock); @@ -2480,10 +2528,6 @@ again: delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - if (count == 0) { - count = delayed_refs->num_entries * 2; - run_most = 1; - } while (1) { if (!(run_all || run_most) && delayed_refs->num_heads_ready < 64) @@ -2505,9 +2549,12 @@ again: if (ret < 0) { spin_unlock(&delayed_refs->lock); btrfs_abort_transaction(trans, root, ret); + atomic_dec(&delayed_refs->procs_running_refs); return ret; } + atomic_add(ret, &delayed_refs->ref_seq); + count -= min_t(unsigned long, ret, count); if (count == 0) @@ -2576,6 +2623,11 @@ again: goto again; } out: + atomic_dec(&delayed_refs->procs_running_refs); + smp_mb(); + if (waitqueue_active(&delayed_refs->wait)) + wake_up(&delayed_refs->wait); + spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a065dec0e330..1e7f176bd211 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -156,6 +156,9 @@ loop: spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); + atomic_set(&cur_trans->delayed_refs.ref_seq, 0); + init_waitqueue_head(&cur_trans->delayed_refs.wait); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &fs_info->trans_list); @@ -577,7 +580,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - while (count < 2) { + while (count < 1) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && @@ -589,6 +592,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } count++; } + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL;