]> git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
xfs: convert buffer cache hash to rbtree
authorDave Chinner <dchinner@redhat.com>
Fri, 24 Sep 2010 09:59:04 +0000 (19:59 +1000)
committerAlex Elder <aelder@sgi.com>
Mon, 18 Oct 2010 20:07:56 +0000 (15:07 -0500)
The buffer cache hash is showing typical hash scalability problems.
In large scale testing the number of cached items growing far larger
than the hash can efficiently handle. Hence we need to move to a
self-scaling cache indexing mechanism.

I have selected rbtrees for indexing becuse they can have O(log n)
search scalability, and insert and remove cost is not excessive,
even on large trees. Hence we should be able to cache large numbers
of buffers without incurring the excessive cache miss search
penalties that the hash is imposing on us.

To ensure we still have parallel access to the cache, we need
multiple trees. Rather than hashing the buffers by disk address to
select a tree, it seems more sensible to separate trees by typical
access patterns. Most operations use buffers from within a single AG
at a time, so rather than searching lots of different lists,
separate the buffer indexes out into per-AG rbtrees. This means that
searches during metadata operation have a much higher chance of
hitting cache resident nodes, and that updates of the tree are less
likely to disturb trees being accessed on other CPUs doing
independent operations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
fs/xfs/linux-2.6/xfs_buf.c
fs/xfs/linux-2.6/xfs_buf.h
fs/xfs/xfs_ag.h
fs/xfs/xfs_mount.c

index 251bcdc6352ed1216c7f95bf9ef4256afd676607..749d7d39d6577e749bb2bf6198895cfd737b9a16 100644 (file)
@@ -188,7 +188,7 @@ _xfs_buf_initialize(
        atomic_set(&bp->b_hold, 1);
        init_completion(&bp->b_iowait);
        INIT_LIST_HEAD(&bp->b_list);
-       INIT_LIST_HEAD(&bp->b_hash_list);
+       RB_CLEAR_NODE(&bp->b_rbnode);
        init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
        XB_SET_OWNER(bp);
        bp->b_target = target;
@@ -262,8 +262,6 @@ xfs_buf_free(
 {
        trace_xfs_buf_free(bp, _RET_IP_);
 
-       ASSERT(list_empty(&bp->b_hash_list));
-
        if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
                uint            i;
 
@@ -422,8 +420,10 @@ _xfs_buf_find(
 {
        xfs_off_t               range_base;
        size_t                  range_length;
-       xfs_bufhash_t           *hash;
-       xfs_buf_t               *bp, *n;
+       struct xfs_perag        *pag;
+       struct rb_node          **rbp;
+       struct rb_node          *parent;
+       xfs_buf_t               *bp;
 
        range_base = (ioff << BBSHIFT);
        range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
        ASSERT(!(range_length < (1 << btp->bt_sshift)));
        ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-       hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-       spin_lock(&hash->bh_lock);
-
-       list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-               ASSERT(btp == bp->b_target);
-               if (bp->b_file_offset == range_base &&
-                   bp->b_buffer_length == range_length) {
+       /* get tree root */
+       pag = xfs_perag_get(btp->bt_mount,
+                               xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+       /* walk tree */
+       spin_lock(&pag->pag_buf_lock);
+       rbp = &pag->pag_buf_tree.rb_node;
+       parent = NULL;
+       bp = NULL;
+       while (*rbp) {
+               parent = *rbp;
+               bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+               if (range_base < bp->b_file_offset)
+                       rbp = &(*rbp)->rb_left;
+               else if (range_base > bp->b_file_offset)
+                       rbp = &(*rbp)->rb_right;
+               else {
+                       /*
+                        * found a block offset match. If the range doesn't
+                        * match, the only way this is allowed is if the buffer
+                        * in the cache is stale and the transaction that made
+                        * it stale has not yet committed. i.e. we are
+                        * reallocating a busy extent. Skip this buffer and
+                        * continue searching to the right for an exact match.
+                        */
+                       if (bp->b_buffer_length != range_length) {
+                               ASSERT(bp->b_flags & XBF_STALE);
+                               rbp = &(*rbp)->rb_right;
+                               continue;
+                       }
                        atomic_inc(&bp->b_hold);
                        goto found;
                }
@@ -449,17 +472,21 @@ _xfs_buf_find(
        if (new_bp) {
                _xfs_buf_initialize(new_bp, btp, range_base,
                                range_length, flags);
-               new_bp->b_hash = hash;
-               list_add(&new_bp->b_hash_list, &hash->bh_list);
+               rb_link_node(&new_bp->b_rbnode, parent, rbp);
+               rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+               /* the buffer keeps the perag reference until it is freed */
+               new_bp->b_pag = pag;
+               spin_unlock(&pag->pag_buf_lock);
        } else {
                XFS_STATS_INC(xb_miss_locked);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
        }
-
-       spin_unlock(&hash->bh_lock);
        return new_bp;
 
 found:
-       spin_unlock(&hash->bh_lock);
+       spin_unlock(&pag->pag_buf_lock);
+       xfs_perag_put(pag);
 
        /* Attempt to get the semaphore without sleeping,
         * if this does not work then we need to drop the
@@ -809,27 +836,30 @@ void
 xfs_buf_rele(
        xfs_buf_t               *bp)
 {
-       xfs_bufhash_t           *hash = bp->b_hash;
+       struct xfs_perag        *pag = bp->b_pag;
 
        trace_xfs_buf_rele(bp, _RET_IP_);
 
-       if (unlikely(!hash)) {
+       if (!pag) {
                ASSERT(!bp->b_relse);
+               ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
                        xfs_buf_free(bp);
                return;
        }
 
+       ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
        ASSERT(atomic_read(&bp->b_hold) > 0);
-       if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
                if (bp->b_relse) {
                        atomic_inc(&bp->b_hold);
-                       spin_unlock(&hash->bh_lock);
-                       (*(bp->b_relse)) (bp);
+                       spin_unlock(&pag->pag_buf_lock);
+                       bp->b_relse(bp);
                } else {
                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-                       list_del_init(&bp->b_hash_list);
-                       spin_unlock(&hash->bh_lock);
+                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+                       spin_unlock(&pag->pag_buf_lock);
+                       xfs_perag_put(pag);
                        xfs_buf_free(bp);
                }
        }
@@ -1429,56 +1459,24 @@ xfs_buf_iomove(
  */
 void
 xfs_wait_buftarg(
-       xfs_buftarg_t   *btp)
+       struct xfs_buftarg      *btp)
 {
-       xfs_bufhash_t   *hash;
-       uint            i;
+       struct xfs_perag        *pag;
+       uint                    i;
 
-       for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-               hash = &btp->bt_hash[i];
-               spin_lock(&hash->bh_lock);
-               while (!list_empty(&hash->bh_list)) {
-                       spin_unlock(&hash->bh_lock);
+       for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+               pag = xfs_perag_get(btp->bt_mount, i);
+               spin_lock(&pag->pag_buf_lock);
+               while (rb_first(&pag->pag_buf_tree)) {
+                       spin_unlock(&pag->pag_buf_lock);
                        delay(100);
-                       spin_lock(&hash->bh_lock);
+                       spin_lock(&pag->pag_buf_lock);
                }
-               spin_unlock(&hash->bh_lock);
-       }
-}
-
-/*
- *     Allocate buffer hash table for a given target.
- *     For devices containing metadata (i.e. not the log/realtime devices)
- *     we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-       xfs_buftarg_t           *btp,
-       int                     external)
-{
-       unsigned int            i;
-
-       if (external) {
-               btp->bt_hash = NULL;
-               return;
-       }
-       btp->bt_hashshift = 12; /* 4096 buckets */
-       btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-                                        sizeof(xfs_bufhash_t));
-       for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-               spin_lock_init(&btp->bt_hash[i].bh_lock);
-               INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
        }
 }
 
-STATIC void
-xfs_free_bufhash(
-       xfs_buftarg_t           *btp)
-{
-       kmem_free_large(btp->bt_hash);
-       btp->bt_hash = NULL;
-}
-
 /*
  *     buftarg list for delwrite queue processing
  */
@@ -1511,7 +1509,6 @@ xfs_free_buftarg(
        xfs_flush_buftarg(btp, 1);
        if (mp->m_flags & XFS_MOUNT_BARRIER)
                xfs_blkdev_issue_flush(btp);
-       xfs_free_bufhash(btp);
        iput(btp->bt_mapping->host);
 
        /* Unregister the buftarg first so that we don't get a
@@ -1651,7 +1648,6 @@ xfs_alloc_buftarg(
                goto error;
        if (xfs_alloc_delwrite_queue(btp, fsname))
                goto error;
-       xfs_alloc_bufhash(btp, external);
        return btp;
 
 error:
index 1f109cee136caf36ae66cea2961c8854b5f929f6..c79882ecb513219f80e3ae5287b4cf5d4004c7ea 100644 (file)
@@ -135,10 +135,6 @@ typedef struct xfs_buftarg {
        unsigned int            bt_sshift;
        size_t                  bt_smask;
 
-       /* per device buffer hash table */
-       uint                    bt_hashshift;
-       xfs_bufhash_t           *bt_hash;
-
        /* per device delwri queue */
        struct task_struct      *bt_task;
        struct list_head        bt_list;
@@ -172,8 +168,8 @@ typedef struct xfs_buf {
        wait_queue_head_t       b_waiters;      /* unpin waiters */
        struct list_head        b_list;
        xfs_buf_flags_t         b_flags;        /* status flags */
-       struct list_head        b_hash_list;    /* hash table list */
-       xfs_bufhash_t           *b_hash;        /* hash table list start */
+       struct rb_node          b_rbnode;       /* rbtree node */
+       struct xfs_perag        *b_pag;         /* contains rbtree root */
        xfs_buftarg_t           *b_target;      /* buffer target (device) */
        atomic_t                b_hold;         /* reference count */
        xfs_daddr_t             b_bn;           /* block number for I/O */
index baeec83d01f9ab06bb2280604928b04130b9d40e..63c7a1a6c02210d9483f0adf5035a06d3620dd1b 100644 (file)
@@ -233,6 +233,10 @@ typedef struct xfs_perag {
        struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
        unsigned long   pag_ici_reclaim_cursor; /* reclaim restart point */
 
+       /* buffer cache index */
+       spinlock_t      pag_buf_lock;   /* lock for pag_buf_tree */
+       struct rb_root  pag_buf_tree;   /* ordered tree of active buffers */
+
        /* for rcu-safe freeing */
        struct rcu_head rcu_head;
 #endif
index 59859c343e04b6407600bea4d33d3e7e18fc9766..cfa2fb4e7f978cbc939c0b577f34059dd1f18f13 100644 (file)
@@ -479,6 +479,8 @@ xfs_initialize_perag(
                rwlock_init(&pag->pag_ici_lock);
                mutex_init(&pag->pag_ici_reclaim_lock);
                INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+               spin_lock_init(&pag->pag_buf_lock);
+               pag->pag_buf_tree = RB_ROOT;
 
                if (radix_tree_preload(GFP_NOFS))
                        goto out_unwind;