]> git.karo-electronics.de Git - karo-tx-linux.git/blob - fs/xfs/libxfs/xfs_btree.c
c91823c202b6f1fac7469de501ebf3de29586cd1
[karo-tx-linux.git] / fs / xfs / libxfs / xfs_btree.c
1 /*
2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_bit.h"
25 #include "xfs_mount.h"
26 #include "xfs_defer.h"
27 #include "xfs_inode.h"
28 #include "xfs_trans.h"
29 #include "xfs_inode_item.h"
30 #include "xfs_buf_item.h"
31 #include "xfs_btree.h"
32 #include "xfs_error.h"
33 #include "xfs_trace.h"
34 #include "xfs_cksum.h"
35 #include "xfs_alloc.h"
36 #include "xfs_log.h"
37
38 /*
39  * Cursor allocation zone.
40  */
41 kmem_zone_t     *xfs_btree_cur_zone;
42
43 /*
44  * Btree magic numbers.
45  */
46 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
47         { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
48           XFS_FIBT_MAGIC, 0 },
49         { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
50           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
51           XFS_REFC_CRC_MAGIC }
52 };
53 #define xfs_btree_magic(cur) \
54         xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
55
56 STATIC int                              /* error (0 or EFSCORRUPTED) */
57 xfs_btree_check_lblock(
58         struct xfs_btree_cur    *cur,   /* btree cursor */
59         struct xfs_btree_block  *block, /* btree long form block pointer */
60         int                     level,  /* level of the btree block */
61         struct xfs_buf          *bp)    /* buffer for block, if any */
62 {
63         int                     lblock_ok = 1; /* block passes checks */
64         struct xfs_mount        *mp;    /* file system mount point */
65
66         mp = cur->bc_mp;
67
68         if (xfs_sb_version_hascrc(&mp->m_sb)) {
69                 lblock_ok = lblock_ok &&
70                         uuid_equal(&block->bb_u.l.bb_uuid,
71                                    &mp->m_sb.sb_meta_uuid) &&
72                         block->bb_u.l.bb_blkno == cpu_to_be64(
73                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
74         }
75
76         lblock_ok = lblock_ok &&
77                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
78                 be16_to_cpu(block->bb_level) == level &&
79                 be16_to_cpu(block->bb_numrecs) <=
80                         cur->bc_ops->get_maxrecs(cur, level) &&
81                 block->bb_u.l.bb_leftsib &&
82                 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
83                  XFS_FSB_SANITY_CHECK(mp,
84                         be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
85                 block->bb_u.l.bb_rightsib &&
86                 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
87                  XFS_FSB_SANITY_CHECK(mp,
88                         be64_to_cpu(block->bb_u.l.bb_rightsib)));
89
90         if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
91                         XFS_ERRTAG_BTREE_CHECK_LBLOCK,
92                         XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
93                 if (bp)
94                         trace_xfs_btree_corrupt(bp, _RET_IP_);
95                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
96                 return -EFSCORRUPTED;
97         }
98         return 0;
99 }
100
101 STATIC int                              /* error (0 or EFSCORRUPTED) */
102 xfs_btree_check_sblock(
103         struct xfs_btree_cur    *cur,   /* btree cursor */
104         struct xfs_btree_block  *block, /* btree short form block pointer */
105         int                     level,  /* level of the btree block */
106         struct xfs_buf          *bp)    /* buffer containing block */
107 {
108         struct xfs_mount        *mp;    /* file system mount point */
109         struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
110         struct xfs_agf          *agf;   /* ag. freespace structure */
111         xfs_agblock_t           agflen; /* native ag. freespace length */
112         int                     sblock_ok = 1; /* block passes checks */
113
114         mp = cur->bc_mp;
115         agbp = cur->bc_private.a.agbp;
116         agf = XFS_BUF_TO_AGF(agbp);
117         agflen = be32_to_cpu(agf->agf_length);
118
119         if (xfs_sb_version_hascrc(&mp->m_sb)) {
120                 sblock_ok = sblock_ok &&
121                         uuid_equal(&block->bb_u.s.bb_uuid,
122                                    &mp->m_sb.sb_meta_uuid) &&
123                         block->bb_u.s.bb_blkno == cpu_to_be64(
124                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
125         }
126
127         sblock_ok = sblock_ok &&
128                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
129                 be16_to_cpu(block->bb_level) == level &&
130                 be16_to_cpu(block->bb_numrecs) <=
131                         cur->bc_ops->get_maxrecs(cur, level) &&
132                 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
133                  be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
134                 block->bb_u.s.bb_leftsib &&
135                 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
136                  be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
137                 block->bb_u.s.bb_rightsib;
138
139         if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
140                         XFS_ERRTAG_BTREE_CHECK_SBLOCK,
141                         XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
142                 if (bp)
143                         trace_xfs_btree_corrupt(bp, _RET_IP_);
144                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
145                 return -EFSCORRUPTED;
146         }
147         return 0;
148 }
149
150 /*
151  * Debug routine: check that block header is ok.
152  */
153 int
154 xfs_btree_check_block(
155         struct xfs_btree_cur    *cur,   /* btree cursor */
156         struct xfs_btree_block  *block, /* generic btree block pointer */
157         int                     level,  /* level of the btree block */
158         struct xfs_buf          *bp)    /* buffer containing block, if any */
159 {
160         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
161                 return xfs_btree_check_lblock(cur, block, level, bp);
162         else
163                 return xfs_btree_check_sblock(cur, block, level, bp);
164 }
165
166 /*
167  * Check that (long) pointer is ok.
168  */
169 int                                     /* error (0 or EFSCORRUPTED) */
170 xfs_btree_check_lptr(
171         struct xfs_btree_cur    *cur,   /* btree cursor */
172         xfs_fsblock_t           bno,    /* btree block disk address */
173         int                     level)  /* btree block level */
174 {
175         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
176                 level > 0 &&
177                 bno != NULLFSBLOCK &&
178                 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
179         return 0;
180 }
181
182 #ifdef DEBUG
183 /*
184  * Check that (short) pointer is ok.
185  */
186 STATIC int                              /* error (0 or EFSCORRUPTED) */
187 xfs_btree_check_sptr(
188         struct xfs_btree_cur    *cur,   /* btree cursor */
189         xfs_agblock_t           bno,    /* btree block disk address */
190         int                     level)  /* btree block level */
191 {
192         xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
193
194         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
195                 level > 0 &&
196                 bno != NULLAGBLOCK &&
197                 bno != 0 &&
198                 bno < agblocks);
199         return 0;
200 }
201
202 /*
203  * Check that block ptr is ok.
204  */
205 STATIC int                              /* error (0 or EFSCORRUPTED) */
206 xfs_btree_check_ptr(
207         struct xfs_btree_cur    *cur,   /* btree cursor */
208         union xfs_btree_ptr     *ptr,   /* btree block disk address */
209         int                     index,  /* offset from ptr to check */
210         int                     level)  /* btree block level */
211 {
212         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
213                 return xfs_btree_check_lptr(cur,
214                                 be64_to_cpu((&ptr->l)[index]), level);
215         } else {
216                 return xfs_btree_check_sptr(cur,
217                                 be32_to_cpu((&ptr->s)[index]), level);
218         }
219 }
220 #endif
221
222 /*
223  * Calculate CRC on the whole btree block and stuff it into the
224  * long-form btree header.
225  *
226  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
227  * it into the buffer so recovery knows what the last modification was that made
228  * it to disk.
229  */
230 void
231 xfs_btree_lblock_calc_crc(
232         struct xfs_buf          *bp)
233 {
234         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
235         struct xfs_buf_log_item *bip = bp->b_fspriv;
236
237         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
238                 return;
239         if (bip)
240                 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
241         xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
242 }
243
244 bool
245 xfs_btree_lblock_verify_crc(
246         struct xfs_buf          *bp)
247 {
248         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
249         struct xfs_mount        *mp = bp->b_target->bt_mount;
250
251         if (xfs_sb_version_hascrc(&mp->m_sb)) {
252                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
253                         return false;
254                 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
255         }
256
257         return true;
258 }
259
260 /*
261  * Calculate CRC on the whole btree block and stuff it into the
262  * short-form btree header.
263  *
264  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
265  * it into the buffer so recovery knows what the last modification was that made
266  * it to disk.
267  */
268 void
269 xfs_btree_sblock_calc_crc(
270         struct xfs_buf          *bp)
271 {
272         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
273         struct xfs_buf_log_item *bip = bp->b_fspriv;
274
275         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
276                 return;
277         if (bip)
278                 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
279         xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
280 }
281
282 bool
283 xfs_btree_sblock_verify_crc(
284         struct xfs_buf          *bp)
285 {
286         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
287         struct xfs_mount        *mp = bp->b_target->bt_mount;
288
289         if (xfs_sb_version_hascrc(&mp->m_sb)) {
290                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
291                         return false;
292                 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
293         }
294
295         return true;
296 }
297
298 static int
299 xfs_btree_free_block(
300         struct xfs_btree_cur    *cur,
301         struct xfs_buf          *bp)
302 {
303         int                     error;
304
305         error = cur->bc_ops->free_block(cur, bp);
306         if (!error) {
307                 xfs_trans_binval(cur->bc_tp, bp);
308                 XFS_BTREE_STATS_INC(cur, free);
309         }
310         return error;
311 }
312
313 /*
314  * Delete the btree cursor.
315  */
316 void
317 xfs_btree_del_cursor(
318         xfs_btree_cur_t *cur,           /* btree cursor */
319         int             error)          /* del because of error */
320 {
321         int             i;              /* btree level */
322
323         /*
324          * Clear the buffer pointers, and release the buffers.
325          * If we're doing this in the face of an error, we
326          * need to make sure to inspect all of the entries
327          * in the bc_bufs array for buffers to be unlocked.
328          * This is because some of the btree code works from
329          * level n down to 0, and if we get an error along
330          * the way we won't have initialized all the entries
331          * down to 0.
332          */
333         for (i = 0; i < cur->bc_nlevels; i++) {
334                 if (cur->bc_bufs[i])
335                         xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
336                 else if (!error)
337                         break;
338         }
339         /*
340          * Can't free a bmap cursor without having dealt with the
341          * allocated indirect blocks' accounting.
342          */
343         ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
344                cur->bc_private.b.allocated == 0);
345         /*
346          * Free the cursor.
347          */
348         kmem_zone_free(xfs_btree_cur_zone, cur);
349 }
350
351 /*
352  * Duplicate the btree cursor.
353  * Allocate a new one, copy the record, re-get the buffers.
354  */
355 int                                     /* error */
356 xfs_btree_dup_cursor(
357         xfs_btree_cur_t *cur,           /* input cursor */
358         xfs_btree_cur_t **ncur)         /* output cursor */
359 {
360         xfs_buf_t       *bp;            /* btree block's buffer pointer */
361         int             error;          /* error return value */
362         int             i;              /* level number of btree block */
363         xfs_mount_t     *mp;            /* mount structure for filesystem */
364         xfs_btree_cur_t *new;           /* new cursor value */
365         xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
366
367         tp = cur->bc_tp;
368         mp = cur->bc_mp;
369
370         /*
371          * Allocate a new cursor like the old one.
372          */
373         new = cur->bc_ops->dup_cursor(cur);
374
375         /*
376          * Copy the record currently in the cursor.
377          */
378         new->bc_rec = cur->bc_rec;
379
380         /*
381          * For each level current, re-get the buffer and copy the ptr value.
382          */
383         for (i = 0; i < new->bc_nlevels; i++) {
384                 new->bc_ptrs[i] = cur->bc_ptrs[i];
385                 new->bc_ra[i] = cur->bc_ra[i];
386                 bp = cur->bc_bufs[i];
387                 if (bp) {
388                         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
389                                                    XFS_BUF_ADDR(bp), mp->m_bsize,
390                                                    0, &bp,
391                                                    cur->bc_ops->buf_ops);
392                         if (error) {
393                                 xfs_btree_del_cursor(new, error);
394                                 *ncur = NULL;
395                                 return error;
396                         }
397                 }
398                 new->bc_bufs[i] = bp;
399         }
400         *ncur = new;
401         return 0;
402 }
403
404 /*
405  * XFS btree block layout and addressing:
406  *
407  * There are two types of blocks in the btree: leaf and non-leaf blocks.
408  *
409  * The leaf record start with a header then followed by records containing
410  * the values.  A non-leaf block also starts with the same header, and
411  * then first contains lookup keys followed by an equal number of pointers
412  * to the btree blocks at the previous level.
413  *
414  *              +--------+-------+-------+-------+-------+-------+-------+
415  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
416  *              +--------+-------+-------+-------+-------+-------+-------+
417  *
418  *              +--------+-------+-------+-------+-------+-------+-------+
419  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
420  *              +--------+-------+-------+-------+-------+-------+-------+
421  *
422  * The header is called struct xfs_btree_block for reasons better left unknown
423  * and comes in different versions for short (32bit) and long (64bit) block
424  * pointers.  The record and key structures are defined by the btree instances
425  * and opaque to the btree core.  The block pointers are simple disk endian
426  * integers, available in a short (32bit) and long (64bit) variant.
427  *
428  * The helpers below calculate the offset of a given record, key or pointer
429  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
430  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
431  * inside the btree block is done using indices starting at one, not zero!
432  *
433  * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
434  * overlapping intervals.  In such a tree, records are still sorted lowest to
435  * highest and indexed by the smallest key value that refers to the record.
436  * However, nodes are different: each pointer has two associated keys -- one
437  * indexing the lowest key available in the block(s) below (the same behavior
438  * as the key in a regular btree) and another indexing the highest key
439  * available in the block(s) below.  Because records are /not/ sorted by the
440  * highest key, all leaf block updates require us to compute the highest key
441  * that matches any record in the leaf and to recursively update the high keys
442  * in the nodes going further up in the tree, if necessary.  Nodes look like
443  * this:
444  *
445  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
446  * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
447  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
448  *
449  * To perform an interval query on an overlapped tree, perform the usual
450  * depth-first search and use the low and high keys to decide if we can skip
451  * that particular node.  If a leaf node is reached, return the records that
452  * intersect the interval.  Note that an interval query may return numerous
453  * entries.  For a non-overlapped tree, simply search for the record associated
454  * with the lowest key and iterate forward until a non-matching record is
455  * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
456  * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
457  * more detail.
458  *
459  * Why do we care about overlapping intervals?  Let's say you have a bunch of
460  * reverse mapping records on a reflink filesystem:
461  *
462  * 1: +- file A startblock B offset C length D -----------+
463  * 2:      +- file E startblock F offset G length H --------------+
464  * 3:      +- file I startblock F offset J length K --+
465  * 4:                                                        +- file L... --+
466  *
467  * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
468  * we'd simply increment the length of record 1.  But how do we find the record
469  * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
470  * record 3 because the keys are ordered first by startblock.  An interval
471  * query would return records 1 and 2 because they both overlap (B+D-1), and
472  * from that we can pick out record 1 as the appropriate left neighbor.
473  *
474  * In the non-overlapped case you can do a LE lookup and decrement the cursor
475  * because a record's interval must end before the next record.
476  */
477
478 /*
479  * Return size of the btree block header for this btree instance.
480  */
481 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
482 {
483         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
484                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
485                         return XFS_BTREE_LBLOCK_CRC_LEN;
486                 return XFS_BTREE_LBLOCK_LEN;
487         }
488         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
489                 return XFS_BTREE_SBLOCK_CRC_LEN;
490         return XFS_BTREE_SBLOCK_LEN;
491 }
492
493 /*
494  * Return size of btree block pointers for this btree instance.
495  */
496 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
497 {
498         return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
499                 sizeof(__be64) : sizeof(__be32);
500 }
501
502 /*
503  * Calculate offset of the n-th record in a btree block.
504  */
505 STATIC size_t
506 xfs_btree_rec_offset(
507         struct xfs_btree_cur    *cur,
508         int                     n)
509 {
510         return xfs_btree_block_len(cur) +
511                 (n - 1) * cur->bc_ops->rec_len;
512 }
513
514 /*
515  * Calculate offset of the n-th key in a btree block.
516  */
517 STATIC size_t
518 xfs_btree_key_offset(
519         struct xfs_btree_cur    *cur,
520         int                     n)
521 {
522         return xfs_btree_block_len(cur) +
523                 (n - 1) * cur->bc_ops->key_len;
524 }
525
526 /*
527  * Calculate offset of the n-th high key in a btree block.
528  */
529 STATIC size_t
530 xfs_btree_high_key_offset(
531         struct xfs_btree_cur    *cur,
532         int                     n)
533 {
534         return xfs_btree_block_len(cur) +
535                 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
536 }
537
538 /*
539  * Calculate offset of the n-th block pointer in a btree block.
540  */
541 STATIC size_t
542 xfs_btree_ptr_offset(
543         struct xfs_btree_cur    *cur,
544         int                     n,
545         int                     level)
546 {
547         return xfs_btree_block_len(cur) +
548                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
549                 (n - 1) * xfs_btree_ptr_len(cur);
550 }
551
552 /*
553  * Return a pointer to the n-th record in the btree block.
554  */
555 STATIC union xfs_btree_rec *
556 xfs_btree_rec_addr(
557         struct xfs_btree_cur    *cur,
558         int                     n,
559         struct xfs_btree_block  *block)
560 {
561         return (union xfs_btree_rec *)
562                 ((char *)block + xfs_btree_rec_offset(cur, n));
563 }
564
565 /*
566  * Return a pointer to the n-th key in the btree block.
567  */
568 STATIC union xfs_btree_key *
569 xfs_btree_key_addr(
570         struct xfs_btree_cur    *cur,
571         int                     n,
572         struct xfs_btree_block  *block)
573 {
574         return (union xfs_btree_key *)
575                 ((char *)block + xfs_btree_key_offset(cur, n));
576 }
577
578 /*
579  * Return a pointer to the n-th high key in the btree block.
580  */
581 STATIC union xfs_btree_key *
582 xfs_btree_high_key_addr(
583         struct xfs_btree_cur    *cur,
584         int                     n,
585         struct xfs_btree_block  *block)
586 {
587         return (union xfs_btree_key *)
588                 ((char *)block + xfs_btree_high_key_offset(cur, n));
589 }
590
591 /*
592  * Return a pointer to the n-th block pointer in the btree block.
593  */
594 STATIC union xfs_btree_ptr *
595 xfs_btree_ptr_addr(
596         struct xfs_btree_cur    *cur,
597         int                     n,
598         struct xfs_btree_block  *block)
599 {
600         int                     level = xfs_btree_get_level(block);
601
602         ASSERT(block->bb_level != 0);
603
604         return (union xfs_btree_ptr *)
605                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
606 }
607
608 /*
609  * Get the root block which is stored in the inode.
610  *
611  * For now this btree implementation assumes the btree root is always
612  * stored in the if_broot field of an inode fork.
613  */
614 STATIC struct xfs_btree_block *
615 xfs_btree_get_iroot(
616         struct xfs_btree_cur    *cur)
617 {
618         struct xfs_ifork        *ifp;
619
620         ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
621         return (struct xfs_btree_block *)ifp->if_broot;
622 }
623
624 /*
625  * Retrieve the block pointer from the cursor at the given level.
626  * This may be an inode btree root or from a buffer.
627  */
628 STATIC struct xfs_btree_block *         /* generic btree block pointer */
629 xfs_btree_get_block(
630         struct xfs_btree_cur    *cur,   /* btree cursor */
631         int                     level,  /* level in btree */
632         struct xfs_buf          **bpp)  /* buffer containing the block */
633 {
634         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
635             (level == cur->bc_nlevels - 1)) {
636                 *bpp = NULL;
637                 return xfs_btree_get_iroot(cur);
638         }
639
640         *bpp = cur->bc_bufs[level];
641         return XFS_BUF_TO_BLOCK(*bpp);
642 }
643
644 /*
645  * Get a buffer for the block, return it with no data read.
646  * Long-form addressing.
647  */
648 xfs_buf_t *                             /* buffer for fsbno */
649 xfs_btree_get_bufl(
650         xfs_mount_t     *mp,            /* file system mount point */
651         xfs_trans_t     *tp,            /* transaction pointer */
652         xfs_fsblock_t   fsbno,          /* file system block number */
653         uint            lock)           /* lock flags for get_buf */
654 {
655         xfs_daddr_t             d;              /* real disk block address */
656
657         ASSERT(fsbno != NULLFSBLOCK);
658         d = XFS_FSB_TO_DADDR(mp, fsbno);
659         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
660 }
661
662 /*
663  * Get a buffer for the block, return it with no data read.
664  * Short-form addressing.
665  */
666 xfs_buf_t *                             /* buffer for agno/agbno */
667 xfs_btree_get_bufs(
668         xfs_mount_t     *mp,            /* file system mount point */
669         xfs_trans_t     *tp,            /* transaction pointer */
670         xfs_agnumber_t  agno,           /* allocation group number */
671         xfs_agblock_t   agbno,          /* allocation group block number */
672         uint            lock)           /* lock flags for get_buf */
673 {
674         xfs_daddr_t             d;              /* real disk block address */
675
676         ASSERT(agno != NULLAGNUMBER);
677         ASSERT(agbno != NULLAGBLOCK);
678         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
679         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
680 }
681
682 /*
683  * Check for the cursor referring to the last block at the given level.
684  */
685 int                                     /* 1=is last block, 0=not last block */
686 xfs_btree_islastblock(
687         xfs_btree_cur_t         *cur,   /* btree cursor */
688         int                     level)  /* level to check */
689 {
690         struct xfs_btree_block  *block; /* generic btree block pointer */
691         xfs_buf_t               *bp;    /* buffer containing block */
692
693         block = xfs_btree_get_block(cur, level, &bp);
694         xfs_btree_check_block(cur, block, level, bp);
695         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
696                 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
697         else
698                 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
699 }
700
701 /*
702  * Change the cursor to point to the first record at the given level.
703  * Other levels are unaffected.
704  */
705 STATIC int                              /* success=1, failure=0 */
706 xfs_btree_firstrec(
707         xfs_btree_cur_t         *cur,   /* btree cursor */
708         int                     level)  /* level to change */
709 {
710         struct xfs_btree_block  *block; /* generic btree block pointer */
711         xfs_buf_t               *bp;    /* buffer containing block */
712
713         /*
714          * Get the block pointer for this level.
715          */
716         block = xfs_btree_get_block(cur, level, &bp);
717         xfs_btree_check_block(cur, block, level, bp);
718         /*
719          * It's empty, there is no such record.
720          */
721         if (!block->bb_numrecs)
722                 return 0;
723         /*
724          * Set the ptr value to 1, that's the first record/key.
725          */
726         cur->bc_ptrs[level] = 1;
727         return 1;
728 }
729
730 /*
731  * Change the cursor to point to the last record in the current block
732  * at the given level.  Other levels are unaffected.
733  */
734 STATIC int                              /* success=1, failure=0 */
735 xfs_btree_lastrec(
736         xfs_btree_cur_t         *cur,   /* btree cursor */
737         int                     level)  /* level to change */
738 {
739         struct xfs_btree_block  *block; /* generic btree block pointer */
740         xfs_buf_t               *bp;    /* buffer containing block */
741
742         /*
743          * Get the block pointer for this level.
744          */
745         block = xfs_btree_get_block(cur, level, &bp);
746         xfs_btree_check_block(cur, block, level, bp);
747         /*
748          * It's empty, there is no such record.
749          */
750         if (!block->bb_numrecs)
751                 return 0;
752         /*
753          * Set the ptr value to numrecs, that's the last record/key.
754          */
755         cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
756         return 1;
757 }
758
759 /*
760  * Compute first and last byte offsets for the fields given.
761  * Interprets the offsets table, which contains struct field offsets.
762  */
763 void
764 xfs_btree_offsets(
765         __int64_t       fields,         /* bitmask of fields */
766         const short     *offsets,       /* table of field offsets */
767         int             nbits,          /* number of bits to inspect */
768         int             *first,         /* output: first byte offset */
769         int             *last)          /* output: last byte offset */
770 {
771         int             i;              /* current bit number */
772         __int64_t       imask;          /* mask for current bit number */
773
774         ASSERT(fields != 0);
775         /*
776          * Find the lowest bit, so the first byte offset.
777          */
778         for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
779                 if (imask & fields) {
780                         *first = offsets[i];
781                         break;
782                 }
783         }
784         /*
785          * Find the highest bit, so the last byte offset.
786          */
787         for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
788                 if (imask & fields) {
789                         *last = offsets[i + 1] - 1;
790                         break;
791                 }
792         }
793 }
794
795 /*
796  * Get a buffer for the block, return it read in.
797  * Long-form addressing.
798  */
799 int
800 xfs_btree_read_bufl(
801         struct xfs_mount        *mp,            /* file system mount point */
802         struct xfs_trans        *tp,            /* transaction pointer */
803         xfs_fsblock_t           fsbno,          /* file system block number */
804         uint                    lock,           /* lock flags for read_buf */
805         struct xfs_buf          **bpp,          /* buffer for fsbno */
806         int                     refval,         /* ref count value for buffer */
807         const struct xfs_buf_ops *ops)
808 {
809         struct xfs_buf          *bp;            /* return value */
810         xfs_daddr_t             d;              /* real disk block address */
811         int                     error;
812
813         ASSERT(fsbno != NULLFSBLOCK);
814         d = XFS_FSB_TO_DADDR(mp, fsbno);
815         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
816                                    mp->m_bsize, lock, &bp, ops);
817         if (error)
818                 return error;
819         if (bp)
820                 xfs_buf_set_ref(bp, refval);
821         *bpp = bp;
822         return 0;
823 }
824
825 /*
826  * Read-ahead the block, don't wait for it, don't return a buffer.
827  * Long-form addressing.
828  */
829 /* ARGSUSED */
830 void
831 xfs_btree_reada_bufl(
832         struct xfs_mount        *mp,            /* file system mount point */
833         xfs_fsblock_t           fsbno,          /* file system block number */
834         xfs_extlen_t            count,          /* count of filesystem blocks */
835         const struct xfs_buf_ops *ops)
836 {
837         xfs_daddr_t             d;
838
839         ASSERT(fsbno != NULLFSBLOCK);
840         d = XFS_FSB_TO_DADDR(mp, fsbno);
841         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
842 }
843
844 /*
845  * Read-ahead the block, don't wait for it, don't return a buffer.
846  * Short-form addressing.
847  */
848 /* ARGSUSED */
849 void
850 xfs_btree_reada_bufs(
851         struct xfs_mount        *mp,            /* file system mount point */
852         xfs_agnumber_t          agno,           /* allocation group number */
853         xfs_agblock_t           agbno,          /* allocation group block number */
854         xfs_extlen_t            count,          /* count of filesystem blocks */
855         const struct xfs_buf_ops *ops)
856 {
857         xfs_daddr_t             d;
858
859         ASSERT(agno != NULLAGNUMBER);
860         ASSERT(agbno != NULLAGBLOCK);
861         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
862         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
863 }
864
865 STATIC int
866 xfs_btree_readahead_lblock(
867         struct xfs_btree_cur    *cur,
868         int                     lr,
869         struct xfs_btree_block  *block)
870 {
871         int                     rval = 0;
872         xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
873         xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
874
875         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
876                 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
877                                      cur->bc_ops->buf_ops);
878                 rval++;
879         }
880
881         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
882                 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
883                                      cur->bc_ops->buf_ops);
884                 rval++;
885         }
886
887         return rval;
888 }
889
890 STATIC int
891 xfs_btree_readahead_sblock(
892         struct xfs_btree_cur    *cur,
893         int                     lr,
894         struct xfs_btree_block *block)
895 {
896         int                     rval = 0;
897         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
898         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
899
900
901         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
902                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
903                                      left, 1, cur->bc_ops->buf_ops);
904                 rval++;
905         }
906
907         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
908                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
909                                      right, 1, cur->bc_ops->buf_ops);
910                 rval++;
911         }
912
913         return rval;
914 }
915
916 /*
917  * Read-ahead btree blocks, at the given level.
918  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
919  */
920 STATIC int
921 xfs_btree_readahead(
922         struct xfs_btree_cur    *cur,           /* btree cursor */
923         int                     lev,            /* level in btree */
924         int                     lr)             /* left/right bits */
925 {
926         struct xfs_btree_block  *block;
927
928         /*
929          * No readahead needed if we are at the root level and the
930          * btree root is stored in the inode.
931          */
932         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
933             (lev == cur->bc_nlevels - 1))
934                 return 0;
935
936         if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
937                 return 0;
938
939         cur->bc_ra[lev] |= lr;
940         block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
941
942         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
943                 return xfs_btree_readahead_lblock(cur, lr, block);
944         return xfs_btree_readahead_sblock(cur, lr, block);
945 }
946
947 STATIC xfs_daddr_t
948 xfs_btree_ptr_to_daddr(
949         struct xfs_btree_cur    *cur,
950         union xfs_btree_ptr     *ptr)
951 {
952         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
953                 ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
954
955                 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
956         } else {
957                 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
958                 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
959
960                 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
961                                         be32_to_cpu(ptr->s));
962         }
963 }
964
965 /*
966  * Readahead @count btree blocks at the given @ptr location.
967  *
968  * We don't need to care about long or short form btrees here as we have a
969  * method of converting the ptr directly to a daddr available to us.
970  */
971 STATIC void
972 xfs_btree_readahead_ptr(
973         struct xfs_btree_cur    *cur,
974         union xfs_btree_ptr     *ptr,
975         xfs_extlen_t            count)
976 {
977         xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
978                           xfs_btree_ptr_to_daddr(cur, ptr),
979                           cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
980 }
981
982 /*
983  * Set the buffer for level "lev" in the cursor to bp, releasing
984  * any previous buffer.
985  */
986 STATIC void
987 xfs_btree_setbuf(
988         xfs_btree_cur_t         *cur,   /* btree cursor */
989         int                     lev,    /* level in btree */
990         xfs_buf_t               *bp)    /* new buffer to set */
991 {
992         struct xfs_btree_block  *b;     /* btree block */
993
994         if (cur->bc_bufs[lev])
995                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
996         cur->bc_bufs[lev] = bp;
997         cur->bc_ra[lev] = 0;
998
999         b = XFS_BUF_TO_BLOCK(bp);
1000         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1001                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1002                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1003                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1004                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1005         } else {
1006                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1007                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1008                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1009                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1010         }
1011 }
1012
1013 STATIC int
1014 xfs_btree_ptr_is_null(
1015         struct xfs_btree_cur    *cur,
1016         union xfs_btree_ptr     *ptr)
1017 {
1018         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1019                 return ptr->l == cpu_to_be64(NULLFSBLOCK);
1020         else
1021                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
1022 }
1023
1024 STATIC void
1025 xfs_btree_set_ptr_null(
1026         struct xfs_btree_cur    *cur,
1027         union xfs_btree_ptr     *ptr)
1028 {
1029         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1030                 ptr->l = cpu_to_be64(NULLFSBLOCK);
1031         else
1032                 ptr->s = cpu_to_be32(NULLAGBLOCK);
1033 }
1034
1035 /*
1036  * Get/set/init sibling pointers
1037  */
1038 STATIC void
1039 xfs_btree_get_sibling(
1040         struct xfs_btree_cur    *cur,
1041         struct xfs_btree_block  *block,
1042         union xfs_btree_ptr     *ptr,
1043         int                     lr)
1044 {
1045         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1046
1047         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1048                 if (lr == XFS_BB_RIGHTSIB)
1049                         ptr->l = block->bb_u.l.bb_rightsib;
1050                 else
1051                         ptr->l = block->bb_u.l.bb_leftsib;
1052         } else {
1053                 if (lr == XFS_BB_RIGHTSIB)
1054                         ptr->s = block->bb_u.s.bb_rightsib;
1055                 else
1056                         ptr->s = block->bb_u.s.bb_leftsib;
1057         }
1058 }
1059
1060 STATIC void
1061 xfs_btree_set_sibling(
1062         struct xfs_btree_cur    *cur,
1063         struct xfs_btree_block  *block,
1064         union xfs_btree_ptr     *ptr,
1065         int                     lr)
1066 {
1067         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1068
1069         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1070                 if (lr == XFS_BB_RIGHTSIB)
1071                         block->bb_u.l.bb_rightsib = ptr->l;
1072                 else
1073                         block->bb_u.l.bb_leftsib = ptr->l;
1074         } else {
1075                 if (lr == XFS_BB_RIGHTSIB)
1076                         block->bb_u.s.bb_rightsib = ptr->s;
1077                 else
1078                         block->bb_u.s.bb_leftsib = ptr->s;
1079         }
1080 }
1081
1082 void
1083 xfs_btree_init_block_int(
1084         struct xfs_mount        *mp,
1085         struct xfs_btree_block  *buf,
1086         xfs_daddr_t             blkno,
1087         __u32                   magic,
1088         __u16                   level,
1089         __u16                   numrecs,
1090         __u64                   owner,
1091         unsigned int            flags)
1092 {
1093         int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
1094
1095         buf->bb_magic = cpu_to_be32(magic);
1096         buf->bb_level = cpu_to_be16(level);
1097         buf->bb_numrecs = cpu_to_be16(numrecs);
1098
1099         if (flags & XFS_BTREE_LONG_PTRS) {
1100                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1101                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1102                 if (crc) {
1103                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1104                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1105                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1106                         buf->bb_u.l.bb_pad = 0;
1107                         buf->bb_u.l.bb_lsn = 0;
1108                 }
1109         } else {
1110                 /* owner is a 32 bit value on short blocks */
1111                 __u32 __owner = (__u32)owner;
1112
1113                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1114                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1115                 if (crc) {
1116                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1117                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1118                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1119                         buf->bb_u.s.bb_lsn = 0;
1120                 }
1121         }
1122 }
1123
1124 void
1125 xfs_btree_init_block(
1126         struct xfs_mount *mp,
1127         struct xfs_buf  *bp,
1128         __u32           magic,
1129         __u16           level,
1130         __u16           numrecs,
1131         __u64           owner,
1132         unsigned int    flags)
1133 {
1134         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1135                                  magic, level, numrecs, owner, flags);
1136 }
1137
1138 STATIC void
1139 xfs_btree_init_block_cur(
1140         struct xfs_btree_cur    *cur,
1141         struct xfs_buf          *bp,
1142         int                     level,
1143         int                     numrecs)
1144 {
1145         __u64 owner;
1146
1147         /*
1148          * we can pull the owner from the cursor right now as the different
1149          * owners align directly with the pointer size of the btree. This may
1150          * change in future, but is safe for current users of the generic btree
1151          * code.
1152          */
1153         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1154                 owner = cur->bc_private.b.ip->i_ino;
1155         else
1156                 owner = cur->bc_private.a.agno;
1157
1158         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1159                                  xfs_btree_magic(cur), level, numrecs,
1160                                  owner, cur->bc_flags);
1161 }
1162
1163 /*
1164  * Return true if ptr is the last record in the btree and
1165  * we need to track updates to this record.  The decision
1166  * will be further refined in the update_lastrec method.
1167  */
1168 STATIC int
1169 xfs_btree_is_lastrec(
1170         struct xfs_btree_cur    *cur,
1171         struct xfs_btree_block  *block,
1172         int                     level)
1173 {
1174         union xfs_btree_ptr     ptr;
1175
1176         if (level > 0)
1177                 return 0;
1178         if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1179                 return 0;
1180
1181         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1182         if (!xfs_btree_ptr_is_null(cur, &ptr))
1183                 return 0;
1184         return 1;
1185 }
1186
1187 STATIC void
1188 xfs_btree_buf_to_ptr(
1189         struct xfs_btree_cur    *cur,
1190         struct xfs_buf          *bp,
1191         union xfs_btree_ptr     *ptr)
1192 {
1193         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1194                 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1195                                         XFS_BUF_ADDR(bp)));
1196         else {
1197                 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1198                                         XFS_BUF_ADDR(bp)));
1199         }
1200 }
1201
1202 STATIC void
1203 xfs_btree_set_refs(
1204         struct xfs_btree_cur    *cur,
1205         struct xfs_buf          *bp)
1206 {
1207         switch (cur->bc_btnum) {
1208         case XFS_BTNUM_BNO:
1209         case XFS_BTNUM_CNT:
1210                 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1211                 break;
1212         case XFS_BTNUM_INO:
1213         case XFS_BTNUM_FINO:
1214                 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1215                 break;
1216         case XFS_BTNUM_BMAP:
1217                 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1218                 break;
1219         case XFS_BTNUM_RMAP:
1220                 xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1221                 break;
1222         case XFS_BTNUM_REFC:
1223                 xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1224                 break;
1225         default:
1226                 ASSERT(0);
1227         }
1228 }
1229
1230 STATIC int
1231 xfs_btree_get_buf_block(
1232         struct xfs_btree_cur    *cur,
1233         union xfs_btree_ptr     *ptr,
1234         int                     flags,
1235         struct xfs_btree_block  **block,
1236         struct xfs_buf          **bpp)
1237 {
1238         struct xfs_mount        *mp = cur->bc_mp;
1239         xfs_daddr_t             d;
1240
1241         /* need to sort out how callers deal with failures first */
1242         ASSERT(!(flags & XBF_TRYLOCK));
1243
1244         d = xfs_btree_ptr_to_daddr(cur, ptr);
1245         *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1246                                  mp->m_bsize, flags);
1247
1248         if (!*bpp)
1249                 return -ENOMEM;
1250
1251         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1252         *block = XFS_BUF_TO_BLOCK(*bpp);
1253         return 0;
1254 }
1255
1256 /*
1257  * Read in the buffer at the given ptr and return the buffer and
1258  * the block pointer within the buffer.
1259  */
1260 STATIC int
1261 xfs_btree_read_buf_block(
1262         struct xfs_btree_cur    *cur,
1263         union xfs_btree_ptr     *ptr,
1264         int                     flags,
1265         struct xfs_btree_block  **block,
1266         struct xfs_buf          **bpp)
1267 {
1268         struct xfs_mount        *mp = cur->bc_mp;
1269         xfs_daddr_t             d;
1270         int                     error;
1271
1272         /* need to sort out how callers deal with failures first */
1273         ASSERT(!(flags & XBF_TRYLOCK));
1274
1275         d = xfs_btree_ptr_to_daddr(cur, ptr);
1276         error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1277                                    mp->m_bsize, flags, bpp,
1278                                    cur->bc_ops->buf_ops);
1279         if (error)
1280                 return error;
1281
1282         xfs_btree_set_refs(cur, *bpp);
1283         *block = XFS_BUF_TO_BLOCK(*bpp);
1284         return 0;
1285 }
1286
1287 /*
1288  * Copy keys from one btree block to another.
1289  */
1290 STATIC void
1291 xfs_btree_copy_keys(
1292         struct xfs_btree_cur    *cur,
1293         union xfs_btree_key     *dst_key,
1294         union xfs_btree_key     *src_key,
1295         int                     numkeys)
1296 {
1297         ASSERT(numkeys >= 0);
1298         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1299 }
1300
1301 /*
1302  * Copy records from one btree block to another.
1303  */
1304 STATIC void
1305 xfs_btree_copy_recs(
1306         struct xfs_btree_cur    *cur,
1307         union xfs_btree_rec     *dst_rec,
1308         union xfs_btree_rec     *src_rec,
1309         int                     numrecs)
1310 {
1311         ASSERT(numrecs >= 0);
1312         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1313 }
1314
1315 /*
1316  * Copy block pointers from one btree block to another.
1317  */
1318 STATIC void
1319 xfs_btree_copy_ptrs(
1320         struct xfs_btree_cur    *cur,
1321         union xfs_btree_ptr     *dst_ptr,
1322         union xfs_btree_ptr     *src_ptr,
1323         int                     numptrs)
1324 {
1325         ASSERT(numptrs >= 0);
1326         memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1327 }
1328
1329 /*
1330  * Shift keys one index left/right inside a single btree block.
1331  */
1332 STATIC void
1333 xfs_btree_shift_keys(
1334         struct xfs_btree_cur    *cur,
1335         union xfs_btree_key     *key,
1336         int                     dir,
1337         int                     numkeys)
1338 {
1339         char                    *dst_key;
1340
1341         ASSERT(numkeys >= 0);
1342         ASSERT(dir == 1 || dir == -1);
1343
1344         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1345         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1346 }
1347
1348 /*
1349  * Shift records one index left/right inside a single btree block.
1350  */
1351 STATIC void
1352 xfs_btree_shift_recs(
1353         struct xfs_btree_cur    *cur,
1354         union xfs_btree_rec     *rec,
1355         int                     dir,
1356         int                     numrecs)
1357 {
1358         char                    *dst_rec;
1359
1360         ASSERT(numrecs >= 0);
1361         ASSERT(dir == 1 || dir == -1);
1362
1363         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1364         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1365 }
1366
1367 /*
1368  * Shift block pointers one index left/right inside a single btree block.
1369  */
1370 STATIC void
1371 xfs_btree_shift_ptrs(
1372         struct xfs_btree_cur    *cur,
1373         union xfs_btree_ptr     *ptr,
1374         int                     dir,
1375         int                     numptrs)
1376 {
1377         char                    *dst_ptr;
1378
1379         ASSERT(numptrs >= 0);
1380         ASSERT(dir == 1 || dir == -1);
1381
1382         dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1383         memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1384 }
1385
1386 /*
1387  * Log key values from the btree block.
1388  */
1389 STATIC void
1390 xfs_btree_log_keys(
1391         struct xfs_btree_cur    *cur,
1392         struct xfs_buf          *bp,
1393         int                     first,
1394         int                     last)
1395 {
1396         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1397         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1398
1399         if (bp) {
1400                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1401                 xfs_trans_log_buf(cur->bc_tp, bp,
1402                                   xfs_btree_key_offset(cur, first),
1403                                   xfs_btree_key_offset(cur, last + 1) - 1);
1404         } else {
1405                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1406                                 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1407         }
1408
1409         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1410 }
1411
1412 /*
1413  * Log record values from the btree block.
1414  */
1415 void
1416 xfs_btree_log_recs(
1417         struct xfs_btree_cur    *cur,
1418         struct xfs_buf          *bp,
1419         int                     first,
1420         int                     last)
1421 {
1422         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1423         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1424
1425         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1426         xfs_trans_log_buf(cur->bc_tp, bp,
1427                           xfs_btree_rec_offset(cur, first),
1428                           xfs_btree_rec_offset(cur, last + 1) - 1);
1429
1430         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1431 }
1432
1433 /*
1434  * Log block pointer fields from a btree block (nonleaf).
1435  */
1436 STATIC void
1437 xfs_btree_log_ptrs(
1438         struct xfs_btree_cur    *cur,   /* btree cursor */
1439         struct xfs_buf          *bp,    /* buffer containing btree block */
1440         int                     first,  /* index of first pointer to log */
1441         int                     last)   /* index of last pointer to log */
1442 {
1443         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1444         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1445
1446         if (bp) {
1447                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1448                 int                     level = xfs_btree_get_level(block);
1449
1450                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1451                 xfs_trans_log_buf(cur->bc_tp, bp,
1452                                 xfs_btree_ptr_offset(cur, first, level),
1453                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1454         } else {
1455                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1456                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1457         }
1458
1459         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1460 }
1461
1462 /*
1463  * Log fields from a btree block header.
1464  */
1465 void
1466 xfs_btree_log_block(
1467         struct xfs_btree_cur    *cur,   /* btree cursor */
1468         struct xfs_buf          *bp,    /* buffer containing btree block */
1469         int                     fields) /* mask of fields: XFS_BB_... */
1470 {
1471         int                     first;  /* first byte offset logged */
1472         int                     last;   /* last byte offset logged */
1473         static const short      soffsets[] = {  /* table of offsets (short) */
1474                 offsetof(struct xfs_btree_block, bb_magic),
1475                 offsetof(struct xfs_btree_block, bb_level),
1476                 offsetof(struct xfs_btree_block, bb_numrecs),
1477                 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1478                 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1479                 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1480                 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1481                 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1482                 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1483                 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1484                 XFS_BTREE_SBLOCK_CRC_LEN
1485         };
1486         static const short      loffsets[] = {  /* table of offsets (long) */
1487                 offsetof(struct xfs_btree_block, bb_magic),
1488                 offsetof(struct xfs_btree_block, bb_level),
1489                 offsetof(struct xfs_btree_block, bb_numrecs),
1490                 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1491                 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1492                 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1493                 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1494                 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1495                 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1496                 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1497                 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1498                 XFS_BTREE_LBLOCK_CRC_LEN
1499         };
1500
1501         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1502         XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1503
1504         if (bp) {
1505                 int nbits;
1506
1507                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1508                         /*
1509                          * We don't log the CRC when updating a btree
1510                          * block but instead recreate it during log
1511                          * recovery.  As the log buffers have checksums
1512                          * of their own this is safe and avoids logging a crc
1513                          * update in a lot of places.
1514                          */
1515                         if (fields == XFS_BB_ALL_BITS)
1516                                 fields = XFS_BB_ALL_BITS_CRC;
1517                         nbits = XFS_BB_NUM_BITS_CRC;
1518                 } else {
1519                         nbits = XFS_BB_NUM_BITS;
1520                 }
1521                 xfs_btree_offsets(fields,
1522                                   (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1523                                         loffsets : soffsets,
1524                                   nbits, &first, &last);
1525                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1526                 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1527         } else {
1528                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1529                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1530         }
1531
1532         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1533 }
1534
1535 /*
1536  * Increment cursor by one record at the level.
1537  * For nonzero levels the leaf-ward information is untouched.
1538  */
1539 int                                             /* error */
1540 xfs_btree_increment(
1541         struct xfs_btree_cur    *cur,
1542         int                     level,
1543         int                     *stat)          /* success/failure */
1544 {
1545         struct xfs_btree_block  *block;
1546         union xfs_btree_ptr     ptr;
1547         struct xfs_buf          *bp;
1548         int                     error;          /* error return value */
1549         int                     lev;
1550
1551         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1552         XFS_BTREE_TRACE_ARGI(cur, level);
1553
1554         ASSERT(level < cur->bc_nlevels);
1555
1556         /* Read-ahead to the right at this level. */
1557         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1558
1559         /* Get a pointer to the btree block. */
1560         block = xfs_btree_get_block(cur, level, &bp);
1561
1562 #ifdef DEBUG
1563         error = xfs_btree_check_block(cur, block, level, bp);
1564         if (error)
1565                 goto error0;
1566 #endif
1567
1568         /* We're done if we remain in the block after the increment. */
1569         if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1570                 goto out1;
1571
1572         /* Fail if we just went off the right edge of the tree. */
1573         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1574         if (xfs_btree_ptr_is_null(cur, &ptr))
1575                 goto out0;
1576
1577         XFS_BTREE_STATS_INC(cur, increment);
1578
1579         /*
1580          * March up the tree incrementing pointers.
1581          * Stop when we don't go off the right edge of a block.
1582          */
1583         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1584                 block = xfs_btree_get_block(cur, lev, &bp);
1585
1586 #ifdef DEBUG
1587                 error = xfs_btree_check_block(cur, block, lev, bp);
1588                 if (error)
1589                         goto error0;
1590 #endif
1591
1592                 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1593                         break;
1594
1595                 /* Read-ahead the right block for the next loop. */
1596                 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1597         }
1598
1599         /*
1600          * If we went off the root then we are either seriously
1601          * confused or have the tree root in an inode.
1602          */
1603         if (lev == cur->bc_nlevels) {
1604                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1605                         goto out0;
1606                 ASSERT(0);
1607                 error = -EFSCORRUPTED;
1608                 goto error0;
1609         }
1610         ASSERT(lev < cur->bc_nlevels);
1611
1612         /*
1613          * Now walk back down the tree, fixing up the cursor's buffer
1614          * pointers and key numbers.
1615          */
1616         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1617                 union xfs_btree_ptr     *ptrp;
1618
1619                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1620                 --lev;
1621                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1622                 if (error)
1623                         goto error0;
1624
1625                 xfs_btree_setbuf(cur, lev, bp);
1626                 cur->bc_ptrs[lev] = 1;
1627         }
1628 out1:
1629         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1630         *stat = 1;
1631         return 0;
1632
1633 out0:
1634         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1635         *stat = 0;
1636         return 0;
1637
1638 error0:
1639         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1640         return error;
1641 }
1642
1643 /*
1644  * Decrement cursor by one record at the level.
1645  * For nonzero levels the leaf-ward information is untouched.
1646  */
1647 int                                             /* error */
1648 xfs_btree_decrement(
1649         struct xfs_btree_cur    *cur,
1650         int                     level,
1651         int                     *stat)          /* success/failure */
1652 {
1653         struct xfs_btree_block  *block;
1654         xfs_buf_t               *bp;
1655         int                     error;          /* error return value */
1656         int                     lev;
1657         union xfs_btree_ptr     ptr;
1658
1659         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1660         XFS_BTREE_TRACE_ARGI(cur, level);
1661
1662         ASSERT(level < cur->bc_nlevels);
1663
1664         /* Read-ahead to the left at this level. */
1665         xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1666
1667         /* We're done if we remain in the block after the decrement. */
1668         if (--cur->bc_ptrs[level] > 0)
1669                 goto out1;
1670
1671         /* Get a pointer to the btree block. */
1672         block = xfs_btree_get_block(cur, level, &bp);
1673
1674 #ifdef DEBUG
1675         error = xfs_btree_check_block(cur, block, level, bp);
1676         if (error)
1677                 goto error0;
1678 #endif
1679
1680         /* Fail if we just went off the left edge of the tree. */
1681         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1682         if (xfs_btree_ptr_is_null(cur, &ptr))
1683                 goto out0;
1684
1685         XFS_BTREE_STATS_INC(cur, decrement);
1686
1687         /*
1688          * March up the tree decrementing pointers.
1689          * Stop when we don't go off the left edge of a block.
1690          */
1691         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1692                 if (--cur->bc_ptrs[lev] > 0)
1693                         break;
1694                 /* Read-ahead the left block for the next loop. */
1695                 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1696         }
1697
1698         /*
1699          * If we went off the root then we are seriously confused.
1700          * or the root of the tree is in an inode.
1701          */
1702         if (lev == cur->bc_nlevels) {
1703                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1704                         goto out0;
1705                 ASSERT(0);
1706                 error = -EFSCORRUPTED;
1707                 goto error0;
1708         }
1709         ASSERT(lev < cur->bc_nlevels);
1710
1711         /*
1712          * Now walk back down the tree, fixing up the cursor's buffer
1713          * pointers and key numbers.
1714          */
1715         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1716                 union xfs_btree_ptr     *ptrp;
1717
1718                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1719                 --lev;
1720                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1721                 if (error)
1722                         goto error0;
1723                 xfs_btree_setbuf(cur, lev, bp);
1724                 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1725         }
1726 out1:
1727         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1728         *stat = 1;
1729         return 0;
1730
1731 out0:
1732         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1733         *stat = 0;
1734         return 0;
1735
1736 error0:
1737         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1738         return error;
1739 }
1740
1741 STATIC int
1742 xfs_btree_lookup_get_block(
1743         struct xfs_btree_cur    *cur,   /* btree cursor */
1744         int                     level,  /* level in the btree */
1745         union xfs_btree_ptr     *pp,    /* ptr to btree block */
1746         struct xfs_btree_block  **blkp) /* return btree block */
1747 {
1748         struct xfs_buf          *bp;    /* buffer pointer for btree block */
1749         int                     error = 0;
1750
1751         /* special case the root block if in an inode */
1752         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1753             (level == cur->bc_nlevels - 1)) {
1754                 *blkp = xfs_btree_get_iroot(cur);
1755                 return 0;
1756         }
1757
1758         /*
1759          * If the old buffer at this level for the disk address we are
1760          * looking for re-use it.
1761          *
1762          * Otherwise throw it away and get a new one.
1763          */
1764         bp = cur->bc_bufs[level];
1765         if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1766                 *blkp = XFS_BUF_TO_BLOCK(bp);
1767                 return 0;
1768         }
1769
1770         error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1771         if (error)
1772                 return error;
1773
1774         /* Check the inode owner since the verifiers don't. */
1775         if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1776             (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1777             be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1778                         cur->bc_private.b.ip->i_ino)
1779                 goto out_bad;
1780
1781         /* Did we get the level we were looking for? */
1782         if (be16_to_cpu((*blkp)->bb_level) != level)
1783                 goto out_bad;
1784
1785         /* Check that internal nodes have at least one record. */
1786         if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1787                 goto out_bad;
1788
1789         xfs_btree_setbuf(cur, level, bp);
1790         return 0;
1791
1792 out_bad:
1793         *blkp = NULL;
1794         xfs_trans_brelse(cur->bc_tp, bp);
1795         return -EFSCORRUPTED;
1796 }
1797
1798 /*
1799  * Get current search key.  For level 0 we don't actually have a key
1800  * structure so we make one up from the record.  For all other levels
1801  * we just return the right key.
1802  */
1803 STATIC union xfs_btree_key *
1804 xfs_lookup_get_search_key(
1805         struct xfs_btree_cur    *cur,
1806         int                     level,
1807         int                     keyno,
1808         struct xfs_btree_block  *block,
1809         union xfs_btree_key     *kp)
1810 {
1811         if (level == 0) {
1812                 cur->bc_ops->init_key_from_rec(kp,
1813                                 xfs_btree_rec_addr(cur, keyno, block));
1814                 return kp;
1815         }
1816
1817         return xfs_btree_key_addr(cur, keyno, block);
1818 }
1819
1820 /*
1821  * Lookup the record.  The cursor is made to point to it, based on dir.
1822  * stat is set to 0 if can't find any such record, 1 for success.
1823  */
1824 int                                     /* error */
1825 xfs_btree_lookup(
1826         struct xfs_btree_cur    *cur,   /* btree cursor */
1827         xfs_lookup_t            dir,    /* <=, ==, or >= */
1828         int                     *stat)  /* success/failure */
1829 {
1830         struct xfs_btree_block  *block; /* current btree block */
1831         __int64_t               diff;   /* difference for the current key */
1832         int                     error;  /* error return value */
1833         int                     keyno;  /* current key number */
1834         int                     level;  /* level in the btree */
1835         union xfs_btree_ptr     *pp;    /* ptr to btree block */
1836         union xfs_btree_ptr     ptr;    /* ptr to btree block */
1837
1838         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1839         XFS_BTREE_TRACE_ARGI(cur, dir);
1840
1841         XFS_BTREE_STATS_INC(cur, lookup);
1842
1843         /* No such thing as a zero-level tree. */
1844         if (cur->bc_nlevels == 0)
1845                 return -EFSCORRUPTED;
1846
1847         block = NULL;
1848         keyno = 0;
1849
1850         /* initialise start pointer from cursor */
1851         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1852         pp = &ptr;
1853
1854         /*
1855          * Iterate over each level in the btree, starting at the root.
1856          * For each level above the leaves, find the key we need, based
1857          * on the lookup record, then follow the corresponding block
1858          * pointer down to the next level.
1859          */
1860         for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1861                 /* Get the block we need to do the lookup on. */
1862                 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1863                 if (error)
1864                         goto error0;
1865
1866                 if (diff == 0) {
1867                         /*
1868                          * If we already had a key match at a higher level, we
1869                          * know we need to use the first entry in this block.
1870                          */
1871                         keyno = 1;
1872                 } else {
1873                         /* Otherwise search this block. Do a binary search. */
1874
1875                         int     high;   /* high entry number */
1876                         int     low;    /* low entry number */
1877
1878                         /* Set low and high entry numbers, 1-based. */
1879                         low = 1;
1880                         high = xfs_btree_get_numrecs(block);
1881                         if (!high) {
1882                                 /* Block is empty, must be an empty leaf. */
1883                                 ASSERT(level == 0 && cur->bc_nlevels == 1);
1884
1885                                 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1886                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1887                                 *stat = 0;
1888                                 return 0;
1889                         }
1890
1891                         /* Binary search the block. */
1892                         while (low <= high) {
1893                                 union xfs_btree_key     key;
1894                                 union xfs_btree_key     *kp;
1895
1896                                 XFS_BTREE_STATS_INC(cur, compare);
1897
1898                                 /* keyno is average of low and high. */
1899                                 keyno = (low + high) >> 1;
1900
1901                                 /* Get current search key */
1902                                 kp = xfs_lookup_get_search_key(cur, level,
1903                                                 keyno, block, &key);
1904
1905                                 /*
1906                                  * Compute difference to get next direction:
1907                                  *  - less than, move right
1908                                  *  - greater than, move left
1909                                  *  - equal, we're done
1910                                  */
1911                                 diff = cur->bc_ops->key_diff(cur, kp);
1912                                 if (diff < 0)
1913                                         low = keyno + 1;
1914                                 else if (diff > 0)
1915                                         high = keyno - 1;
1916                                 else
1917                                         break;
1918                         }
1919                 }
1920
1921                 /*
1922                  * If there are more levels, set up for the next level
1923                  * by getting the block number and filling in the cursor.
1924                  */
1925                 if (level > 0) {
1926                         /*
1927                          * If we moved left, need the previous key number,
1928                          * unless there isn't one.
1929                          */
1930                         if (diff > 0 && --keyno < 1)
1931                                 keyno = 1;
1932                         pp = xfs_btree_ptr_addr(cur, keyno, block);
1933
1934 #ifdef DEBUG
1935                         error = xfs_btree_check_ptr(cur, pp, 0, level);
1936                         if (error)
1937                                 goto error0;
1938 #endif
1939                         cur->bc_ptrs[level] = keyno;
1940                 }
1941         }
1942
1943         /* Done with the search. See if we need to adjust the results. */
1944         if (dir != XFS_LOOKUP_LE && diff < 0) {
1945                 keyno++;
1946                 /*
1947                  * If ge search and we went off the end of the block, but it's
1948                  * not the last block, we're in the wrong block.
1949                  */
1950                 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1951                 if (dir == XFS_LOOKUP_GE &&
1952                     keyno > xfs_btree_get_numrecs(block) &&
1953                     !xfs_btree_ptr_is_null(cur, &ptr)) {
1954                         int     i;
1955
1956                         cur->bc_ptrs[0] = keyno;
1957                         error = xfs_btree_increment(cur, 0, &i);
1958                         if (error)
1959                                 goto error0;
1960                         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1961                         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1962                         *stat = 1;
1963                         return 0;
1964                 }
1965         } else if (dir == XFS_LOOKUP_LE && diff > 0)
1966                 keyno--;
1967         cur->bc_ptrs[0] = keyno;
1968
1969         /* Return if we succeeded or not. */
1970         if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1971                 *stat = 0;
1972         else if (dir != XFS_LOOKUP_EQ || diff == 0)
1973                 *stat = 1;
1974         else
1975                 *stat = 0;
1976         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1977         return 0;
1978
1979 error0:
1980         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1981         return error;
1982 }
1983
1984 /* Find the high key storage area from a regular key. */
1985 STATIC union xfs_btree_key *
1986 xfs_btree_high_key_from_key(
1987         struct xfs_btree_cur    *cur,
1988         union xfs_btree_key     *key)
1989 {
1990         ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
1991         return (union xfs_btree_key *)((char *)key +
1992                         (cur->bc_ops->key_len / 2));
1993 }
1994
1995 /* Determine the low (and high if overlapped) keys of a leaf block */
1996 STATIC void
1997 xfs_btree_get_leaf_keys(
1998         struct xfs_btree_cur    *cur,
1999         struct xfs_btree_block  *block,
2000         union xfs_btree_key     *key)
2001 {
2002         union xfs_btree_key     max_hkey;
2003         union xfs_btree_key     hkey;
2004         union xfs_btree_rec     *rec;
2005         union xfs_btree_key     *high;
2006         int                     n;
2007
2008         rec = xfs_btree_rec_addr(cur, 1, block);
2009         cur->bc_ops->init_key_from_rec(key, rec);
2010
2011         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2012
2013                 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2014                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2015                         rec = xfs_btree_rec_addr(cur, n, block);
2016                         cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2017                         if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2018                                         > 0)
2019                                 max_hkey = hkey;
2020                 }
2021
2022                 high = xfs_btree_high_key_from_key(cur, key);
2023                 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2024         }
2025 }
2026
2027 /* Determine the low (and high if overlapped) keys of a node block */
2028 STATIC void
2029 xfs_btree_get_node_keys(
2030         struct xfs_btree_cur    *cur,
2031         struct xfs_btree_block  *block,
2032         union xfs_btree_key     *key)
2033 {
2034         union xfs_btree_key     *hkey;
2035         union xfs_btree_key     *max_hkey;
2036         union xfs_btree_key     *high;
2037         int                     n;
2038
2039         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2040                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2041                                 cur->bc_ops->key_len / 2);
2042
2043                 max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2044                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2045                         hkey = xfs_btree_high_key_addr(cur, n, block);
2046                         if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2047                                 max_hkey = hkey;
2048                 }
2049
2050                 high = xfs_btree_high_key_from_key(cur, key);
2051                 memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2052         } else {
2053                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2054                                 cur->bc_ops->key_len);
2055         }
2056 }
2057
2058 /* Derive the keys for any btree block. */
2059 STATIC void
2060 xfs_btree_get_keys(
2061         struct xfs_btree_cur    *cur,
2062         struct xfs_btree_block  *block,
2063         union xfs_btree_key     *key)
2064 {
2065         if (be16_to_cpu(block->bb_level) == 0)
2066                 xfs_btree_get_leaf_keys(cur, block, key);
2067         else
2068                 xfs_btree_get_node_keys(cur, block, key);
2069 }
2070
2071 /*
2072  * Decide if we need to update the parent keys of a btree block.  For
2073  * a standard btree this is only necessary if we're updating the first
2074  * record/key.  For an overlapping btree, we must always update the
2075  * keys because the highest key can be in any of the records or keys
2076  * in the block.
2077  */
2078 static inline bool
2079 xfs_btree_needs_key_update(
2080         struct xfs_btree_cur    *cur,
2081         int                     ptr)
2082 {
2083         return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2084 }
2085
2086 /*
2087  * Update the low and high parent keys of the given level, progressing
2088  * towards the root.  If force_all is false, stop if the keys for a given
2089  * level do not need updating.
2090  */
2091 STATIC int
2092 __xfs_btree_updkeys(
2093         struct xfs_btree_cur    *cur,
2094         int                     level,
2095         struct xfs_btree_block  *block,
2096         struct xfs_buf          *bp0,
2097         bool                    force_all)
2098 {
2099         union xfs_btree_key     key;    /* keys from current level */
2100         union xfs_btree_key     *lkey;  /* keys from the next level up */
2101         union xfs_btree_key     *hkey;
2102         union xfs_btree_key     *nlkey; /* keys from the next level up */
2103         union xfs_btree_key     *nhkey;
2104         struct xfs_buf          *bp;
2105         int                     ptr;
2106
2107         ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2108
2109         /* Exit if there aren't any parent levels to update. */
2110         if (level + 1 >= cur->bc_nlevels)
2111                 return 0;
2112
2113         trace_xfs_btree_updkeys(cur, level, bp0);
2114
2115         lkey = &key;
2116         hkey = xfs_btree_high_key_from_key(cur, lkey);
2117         xfs_btree_get_keys(cur, block, lkey);
2118         for (level++; level < cur->bc_nlevels; level++) {
2119 #ifdef DEBUG
2120                 int             error;
2121 #endif
2122                 block = xfs_btree_get_block(cur, level, &bp);
2123                 trace_xfs_btree_updkeys(cur, level, bp);
2124 #ifdef DEBUG
2125                 error = xfs_btree_check_block(cur, block, level, bp);
2126                 if (error) {
2127                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2128                         return error;
2129                 }
2130 #endif
2131                 ptr = cur->bc_ptrs[level];
2132                 nlkey = xfs_btree_key_addr(cur, ptr, block);
2133                 nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2134                 if (!force_all &&
2135                     !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2136                       cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2137                         break;
2138                 xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2139                 xfs_btree_log_keys(cur, bp, ptr, ptr);
2140                 if (level + 1 >= cur->bc_nlevels)
2141                         break;
2142                 xfs_btree_get_node_keys(cur, block, lkey);
2143         }
2144
2145         return 0;
2146 }
2147
2148 /* Update all the keys from some level in cursor back to the root. */
2149 STATIC int
2150 xfs_btree_updkeys_force(
2151         struct xfs_btree_cur    *cur,
2152         int                     level)
2153 {
2154         struct xfs_buf          *bp;
2155         struct xfs_btree_block  *block;
2156
2157         block = xfs_btree_get_block(cur, level, &bp);
2158         return __xfs_btree_updkeys(cur, level, block, bp, true);
2159 }
2160
2161 /*
2162  * Update the parent keys of the given level, progressing towards the root.
2163  */
2164 STATIC int
2165 xfs_btree_update_keys(
2166         struct xfs_btree_cur    *cur,
2167         int                     level)
2168 {
2169         struct xfs_btree_block  *block;
2170         struct xfs_buf          *bp;
2171         union xfs_btree_key     *kp;
2172         union xfs_btree_key     key;
2173         int                     ptr;
2174
2175         ASSERT(level >= 0);
2176
2177         block = xfs_btree_get_block(cur, level, &bp);
2178         if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2179                 return __xfs_btree_updkeys(cur, level, block, bp, false);
2180
2181         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2182         XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
2183
2184         /*
2185          * Go up the tree from this level toward the root.
2186          * At each level, update the key value to the value input.
2187          * Stop when we reach a level where the cursor isn't pointing
2188          * at the first entry in the block.
2189          */
2190         xfs_btree_get_keys(cur, block, &key);
2191         for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2192 #ifdef DEBUG
2193                 int             error;
2194 #endif
2195                 block = xfs_btree_get_block(cur, level, &bp);
2196 #ifdef DEBUG
2197                 error = xfs_btree_check_block(cur, block, level, bp);
2198                 if (error) {
2199                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2200                         return error;
2201                 }
2202 #endif
2203                 ptr = cur->bc_ptrs[level];
2204                 kp = xfs_btree_key_addr(cur, ptr, block);
2205                 xfs_btree_copy_keys(cur, kp, &key, 1);
2206                 xfs_btree_log_keys(cur, bp, ptr, ptr);
2207         }
2208
2209         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2210         return 0;
2211 }
2212
2213 /*
2214  * Update the record referred to by cur to the value in the
2215  * given record. This either works (return 0) or gets an
2216  * EFSCORRUPTED error.
2217  */
2218 int
2219 xfs_btree_update(
2220         struct xfs_btree_cur    *cur,
2221         union xfs_btree_rec     *rec)
2222 {
2223         struct xfs_btree_block  *block;
2224         struct xfs_buf          *bp;
2225         int                     error;
2226         int                     ptr;
2227         union xfs_btree_rec     *rp;
2228
2229         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2230         XFS_BTREE_TRACE_ARGR(cur, rec);
2231
2232         /* Pick up the current block. */
2233         block = xfs_btree_get_block(cur, 0, &bp);
2234
2235 #ifdef DEBUG
2236         error = xfs_btree_check_block(cur, block, 0, bp);
2237         if (error)
2238                 goto error0;
2239 #endif
2240         /* Get the address of the rec to be updated. */
2241         ptr = cur->bc_ptrs[0];
2242         rp = xfs_btree_rec_addr(cur, ptr, block);
2243
2244         /* Fill in the new contents and log them. */
2245         xfs_btree_copy_recs(cur, rp, rec, 1);
2246         xfs_btree_log_recs(cur, bp, ptr, ptr);
2247
2248         /*
2249          * If we are tracking the last record in the tree and
2250          * we are at the far right edge of the tree, update it.
2251          */
2252         if (xfs_btree_is_lastrec(cur, block, 0)) {
2253                 cur->bc_ops->update_lastrec(cur, block, rec,
2254                                             ptr, LASTREC_UPDATE);
2255         }
2256
2257         /* Pass new key value up to our parent. */
2258         if (xfs_btree_needs_key_update(cur, ptr)) {
2259                 error = xfs_btree_update_keys(cur, 0);
2260                 if (error)
2261                         goto error0;
2262         }
2263
2264         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2265         return 0;
2266
2267 error0:
2268         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2269         return error;
2270 }
2271
2272 /*
2273  * Move 1 record left from cur/level if possible.
2274  * Update cur to reflect the new path.
2275  */
2276 STATIC int                                      /* error */
2277 xfs_btree_lshift(
2278         struct xfs_btree_cur    *cur,
2279         int                     level,
2280         int                     *stat)          /* success/failure */
2281 {
2282         struct xfs_buf          *lbp;           /* left buffer pointer */
2283         struct xfs_btree_block  *left;          /* left btree block */
2284         int                     lrecs;          /* left record count */
2285         struct xfs_buf          *rbp;           /* right buffer pointer */
2286         struct xfs_btree_block  *right;         /* right btree block */
2287         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2288         int                     rrecs;          /* right record count */
2289         union xfs_btree_ptr     lptr;           /* left btree pointer */
2290         union xfs_btree_key     *rkp = NULL;    /* right btree key */
2291         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2292         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2293         int                     error;          /* error return value */
2294         int                     i;
2295
2296         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2297         XFS_BTREE_TRACE_ARGI(cur, level);
2298
2299         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2300             level == cur->bc_nlevels - 1)
2301                 goto out0;
2302
2303         /* Set up variables for this block as "right". */
2304         right = xfs_btree_get_block(cur, level, &rbp);
2305
2306 #ifdef DEBUG
2307         error = xfs_btree_check_block(cur, right, level, rbp);
2308         if (error)
2309                 goto error0;
2310 #endif
2311
2312         /* If we've got no left sibling then we can't shift an entry left. */
2313         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2314         if (xfs_btree_ptr_is_null(cur, &lptr))
2315                 goto out0;
2316
2317         /*
2318          * If the cursor entry is the one that would be moved, don't
2319          * do it... it's too complicated.
2320          */
2321         if (cur->bc_ptrs[level] <= 1)
2322                 goto out0;
2323
2324         /* Set up the left neighbor as "left". */
2325         error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2326         if (error)
2327                 goto error0;
2328
2329         /* If it's full, it can't take another entry. */
2330         lrecs = xfs_btree_get_numrecs(left);
2331         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2332                 goto out0;
2333
2334         rrecs = xfs_btree_get_numrecs(right);
2335
2336         /*
2337          * We add one entry to the left side and remove one for the right side.
2338          * Account for it here, the changes will be updated on disk and logged
2339          * later.
2340          */
2341         lrecs++;
2342         rrecs--;
2343
2344         XFS_BTREE_STATS_INC(cur, lshift);
2345         XFS_BTREE_STATS_ADD(cur, moves, 1);
2346
2347         /*
2348          * If non-leaf, copy a key and a ptr to the left block.
2349          * Log the changes to the left block.
2350          */
2351         if (level > 0) {
2352                 /* It's a non-leaf.  Move keys and pointers. */
2353                 union xfs_btree_key     *lkp;   /* left btree key */
2354                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2355
2356                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2357                 rkp = xfs_btree_key_addr(cur, 1, right);
2358
2359                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2360                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2361 #ifdef DEBUG
2362                 error = xfs_btree_check_ptr(cur, rpp, 0, level);
2363                 if (error)
2364                         goto error0;
2365 #endif
2366                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2367                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2368
2369                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2370                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2371
2372                 ASSERT(cur->bc_ops->keys_inorder(cur,
2373                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2374         } else {
2375                 /* It's a leaf.  Move records.  */
2376                 union xfs_btree_rec     *lrp;   /* left record pointer */
2377
2378                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2379                 rrp = xfs_btree_rec_addr(cur, 1, right);
2380
2381                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2382                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2383
2384                 ASSERT(cur->bc_ops->recs_inorder(cur,
2385                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2386         }
2387
2388         xfs_btree_set_numrecs(left, lrecs);
2389         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2390
2391         xfs_btree_set_numrecs(right, rrecs);
2392         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2393
2394         /*
2395          * Slide the contents of right down one entry.
2396          */
2397         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2398         if (level > 0) {
2399                 /* It's a nonleaf. operate on keys and ptrs */
2400 #ifdef DEBUG
2401                 int                     i;              /* loop index */
2402
2403                 for (i = 0; i < rrecs; i++) {
2404                         error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2405                         if (error)
2406                                 goto error0;
2407                 }
2408 #endif
2409                 xfs_btree_shift_keys(cur,
2410                                 xfs_btree_key_addr(cur, 2, right),
2411                                 -1, rrecs);
2412                 xfs_btree_shift_ptrs(cur,
2413                                 xfs_btree_ptr_addr(cur, 2, right),
2414                                 -1, rrecs);
2415
2416                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2417                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2418         } else {
2419                 /* It's a leaf. operate on records */
2420                 xfs_btree_shift_recs(cur,
2421                         xfs_btree_rec_addr(cur, 2, right),
2422                         -1, rrecs);
2423                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2424         }
2425
2426         /*
2427          * Using a temporary cursor, update the parent key values of the
2428          * block on the left.
2429          */
2430         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2431                 error = xfs_btree_dup_cursor(cur, &tcur);
2432                 if (error)
2433                         goto error0;
2434                 i = xfs_btree_firstrec(tcur, level);
2435                 XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2436
2437                 error = xfs_btree_decrement(tcur, level, &i);
2438                 if (error)
2439                         goto error1;
2440
2441                 /* Update the parent high keys of the left block, if needed. */
2442                 error = xfs_btree_update_keys(tcur, level);
2443                 if (error)
2444                         goto error1;
2445
2446                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2447         }
2448
2449         /* Update the parent keys of the right block. */
2450         error = xfs_btree_update_keys(cur, level);
2451         if (error)
2452                 goto error0;
2453
2454         /* Slide the cursor value left one. */
2455         cur->bc_ptrs[level]--;
2456
2457         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2458         *stat = 1;
2459         return 0;
2460
2461 out0:
2462         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2463         *stat = 0;
2464         return 0;
2465
2466 error0:
2467         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2468         return error;
2469
2470 error1:
2471         XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2472         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2473         return error;
2474 }
2475
2476 /*
2477  * Move 1 record right from cur/level if possible.
2478  * Update cur to reflect the new path.
2479  */
2480 STATIC int                                      /* error */
2481 xfs_btree_rshift(
2482         struct xfs_btree_cur    *cur,
2483         int                     level,
2484         int                     *stat)          /* success/failure */
2485 {
2486         struct xfs_buf          *lbp;           /* left buffer pointer */
2487         struct xfs_btree_block  *left;          /* left btree block */
2488         struct xfs_buf          *rbp;           /* right buffer pointer */
2489         struct xfs_btree_block  *right;         /* right btree block */
2490         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2491         union xfs_btree_ptr     rptr;           /* right block pointer */
2492         union xfs_btree_key     *rkp;           /* right btree key */
2493         int                     rrecs;          /* right record count */
2494         int                     lrecs;          /* left record count */
2495         int                     error;          /* error return value */
2496         int                     i;              /* loop counter */
2497
2498         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2499         XFS_BTREE_TRACE_ARGI(cur, level);
2500
2501         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2502             (level == cur->bc_nlevels - 1))
2503                 goto out0;
2504
2505         /* Set up variables for this block as "left". */
2506         left = xfs_btree_get_block(cur, level, &lbp);
2507
2508 #ifdef DEBUG
2509         error = xfs_btree_check_block(cur, left, level, lbp);
2510         if (error)
2511                 goto error0;
2512 #endif
2513
2514         /* If we've got no right sibling then we can't shift an entry right. */
2515         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2516         if (xfs_btree_ptr_is_null(cur, &rptr))
2517                 goto out0;
2518
2519         /*
2520          * If the cursor entry is the one that would be moved, don't
2521          * do it... it's too complicated.
2522          */
2523         lrecs = xfs_btree_get_numrecs(left);
2524         if (cur->bc_ptrs[level] >= lrecs)
2525                 goto out0;
2526
2527         /* Set up the right neighbor as "right". */
2528         error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2529         if (error)
2530                 goto error0;
2531
2532         /* If it's full, it can't take another entry. */
2533         rrecs = xfs_btree_get_numrecs(right);
2534         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2535                 goto out0;
2536
2537         XFS_BTREE_STATS_INC(cur, rshift);
2538         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2539
2540         /*
2541          * Make a hole at the start of the right neighbor block, then
2542          * copy the last left block entry to the hole.
2543          */
2544         if (level > 0) {
2545                 /* It's a nonleaf. make a hole in the keys and ptrs */
2546                 union xfs_btree_key     *lkp;
2547                 union xfs_btree_ptr     *lpp;
2548                 union xfs_btree_ptr     *rpp;
2549
2550                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2551                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2552                 rkp = xfs_btree_key_addr(cur, 1, right);
2553                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2554
2555 #ifdef DEBUG
2556                 for (i = rrecs - 1; i >= 0; i--) {
2557                         error = xfs_btree_check_ptr(cur, rpp, i, level);
2558                         if (error)
2559                                 goto error0;
2560                 }
2561 #endif
2562
2563                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2564                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2565
2566 #ifdef DEBUG
2567                 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2568                 if (error)
2569                         goto error0;
2570 #endif
2571
2572                 /* Now put the new data in, and log it. */
2573                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2574                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2575
2576                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2577                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2578
2579                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2580                         xfs_btree_key_addr(cur, 2, right)));
2581         } else {
2582                 /* It's a leaf. make a hole in the records */
2583                 union xfs_btree_rec     *lrp;
2584                 union xfs_btree_rec     *rrp;
2585
2586                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2587                 rrp = xfs_btree_rec_addr(cur, 1, right);
2588
2589                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2590
2591                 /* Now put the new data in, and log it. */
2592                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2593                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2594         }
2595
2596         /*
2597          * Decrement and log left's numrecs, bump and log right's numrecs.
2598          */
2599         xfs_btree_set_numrecs(left, --lrecs);
2600         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2601
2602         xfs_btree_set_numrecs(right, ++rrecs);
2603         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2604
2605         /*
2606          * Using a temporary cursor, update the parent key values of the
2607          * block on the right.
2608          */
2609         error = xfs_btree_dup_cursor(cur, &tcur);
2610         if (error)
2611                 goto error0;
2612         i = xfs_btree_lastrec(tcur, level);
2613         XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2614
2615         error = xfs_btree_increment(tcur, level, &i);
2616         if (error)
2617                 goto error1;
2618
2619         /* Update the parent high keys of the left block, if needed. */
2620         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2621                 error = xfs_btree_update_keys(cur, level);
2622                 if (error)
2623                         goto error1;
2624         }
2625
2626         /* Update the parent keys of the right block. */
2627         error = xfs_btree_update_keys(tcur, level);
2628         if (error)
2629                 goto error1;
2630
2631         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2632
2633         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2634         *stat = 1;
2635         return 0;
2636
2637 out0:
2638         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2639         *stat = 0;
2640         return 0;
2641
2642 error0:
2643         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2644         return error;
2645
2646 error1:
2647         XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2648         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2649         return error;
2650 }
2651
2652 /*
2653  * Split cur/level block in half.
2654  * Return new block number and the key to its first
2655  * record (to be inserted into parent).
2656  */
2657 STATIC int                                      /* error */
2658 __xfs_btree_split(
2659         struct xfs_btree_cur    *cur,
2660         int                     level,
2661         union xfs_btree_ptr     *ptrp,
2662         union xfs_btree_key     *key,
2663         struct xfs_btree_cur    **curp,
2664         int                     *stat)          /* success/failure */
2665 {
2666         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2667         struct xfs_buf          *lbp;           /* left buffer pointer */
2668         struct xfs_btree_block  *left;          /* left btree block */
2669         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2670         struct xfs_buf          *rbp;           /* right buffer pointer */
2671         struct xfs_btree_block  *right;         /* right btree block */
2672         union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2673         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2674         struct xfs_btree_block  *rrblock;       /* right-right btree block */
2675         int                     lrecs;
2676         int                     rrecs;
2677         int                     src_index;
2678         int                     error;          /* error return value */
2679 #ifdef DEBUG
2680         int                     i;
2681 #endif
2682
2683         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2684         XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2685
2686         XFS_BTREE_STATS_INC(cur, split);
2687
2688         /* Set up left block (current one). */
2689         left = xfs_btree_get_block(cur, level, &lbp);
2690
2691 #ifdef DEBUG
2692         error = xfs_btree_check_block(cur, left, level, lbp);
2693         if (error)
2694                 goto error0;
2695 #endif
2696
2697         xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2698
2699         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2700         error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2701         if (error)
2702                 goto error0;
2703         if (*stat == 0)
2704                 goto out0;
2705         XFS_BTREE_STATS_INC(cur, alloc);
2706
2707         /* Set up the new block as "right". */
2708         error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2709         if (error)
2710                 goto error0;
2711
2712         /* Fill in the btree header for the new right block. */
2713         xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2714
2715         /*
2716          * Split the entries between the old and the new block evenly.
2717          * Make sure that if there's an odd number of entries now, that
2718          * each new block will have the same number of entries.
2719          */
2720         lrecs = xfs_btree_get_numrecs(left);
2721         rrecs = lrecs / 2;
2722         if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2723                 rrecs++;
2724         src_index = (lrecs - rrecs + 1);
2725
2726         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2727
2728         /* Adjust numrecs for the later get_*_keys() calls. */
2729         lrecs -= rrecs;
2730         xfs_btree_set_numrecs(left, lrecs);
2731         xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2732
2733         /*
2734          * Copy btree block entries from the left block over to the
2735          * new block, the right. Update the right block and log the
2736          * changes.
2737          */
2738         if (level > 0) {
2739                 /* It's a non-leaf.  Move keys and pointers. */
2740                 union xfs_btree_key     *lkp;   /* left btree key */
2741                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2742                 union xfs_btree_key     *rkp;   /* right btree key */
2743                 union xfs_btree_ptr     *rpp;   /* right address pointer */
2744
2745                 lkp = xfs_btree_key_addr(cur, src_index, left);
2746                 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2747                 rkp = xfs_btree_key_addr(cur, 1, right);
2748                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2749
2750 #ifdef DEBUG
2751                 for (i = src_index; i < rrecs; i++) {
2752                         error = xfs_btree_check_ptr(cur, lpp, i, level);
2753                         if (error)
2754                                 goto error0;
2755                 }
2756 #endif
2757
2758                 /* Copy the keys & pointers to the new block. */
2759                 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2760                 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2761
2762                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2763                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2764
2765                 /* Stash the keys of the new block for later insertion. */
2766                 xfs_btree_get_node_keys(cur, right, key);
2767         } else {
2768                 /* It's a leaf.  Move records.  */
2769                 union xfs_btree_rec     *lrp;   /* left record pointer */
2770                 union xfs_btree_rec     *rrp;   /* right record pointer */
2771
2772                 lrp = xfs_btree_rec_addr(cur, src_index, left);
2773                 rrp = xfs_btree_rec_addr(cur, 1, right);
2774
2775                 /* Copy records to the new block. */
2776                 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2777                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2778
2779                 /* Stash the keys of the new block for later insertion. */
2780                 xfs_btree_get_leaf_keys(cur, right, key);
2781         }
2782
2783         /*
2784          * Find the left block number by looking in the buffer.
2785          * Adjust sibling pointers.
2786          */
2787         xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2788         xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2789         xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2790         xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2791
2792         xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2793         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2794
2795         /*
2796          * If there's a block to the new block's right, make that block
2797          * point back to right instead of to left.
2798          */
2799         if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2800                 error = xfs_btree_read_buf_block(cur, &rrptr,
2801                                                         0, &rrblock, &rrbp);
2802                 if (error)
2803                         goto error0;
2804                 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2805                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2806         }
2807
2808         /* Update the parent high keys of the left block, if needed. */
2809         if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2810                 error = xfs_btree_update_keys(cur, level);
2811                 if (error)
2812                         goto error0;
2813         }
2814
2815         /*
2816          * If the cursor is really in the right block, move it there.
2817          * If it's just pointing past the last entry in left, then we'll
2818          * insert there, so don't change anything in that case.
2819          */
2820         if (cur->bc_ptrs[level] > lrecs + 1) {
2821                 xfs_btree_setbuf(cur, level, rbp);
2822                 cur->bc_ptrs[level] -= lrecs;
2823         }
2824         /*
2825          * If there are more levels, we'll need another cursor which refers
2826          * the right block, no matter where this cursor was.
2827          */
2828         if (level + 1 < cur->bc_nlevels) {
2829                 error = xfs_btree_dup_cursor(cur, curp);
2830                 if (error)
2831                         goto error0;
2832                 (*curp)->bc_ptrs[level + 1]++;
2833         }
2834         *ptrp = rptr;
2835         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2836         *stat = 1;
2837         return 0;
2838 out0:
2839         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2840         *stat = 0;
2841         return 0;
2842
2843 error0:
2844         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2845         return error;
2846 }
2847
2848 struct xfs_btree_split_args {
2849         struct xfs_btree_cur    *cur;
2850         int                     level;
2851         union xfs_btree_ptr     *ptrp;
2852         union xfs_btree_key     *key;
2853         struct xfs_btree_cur    **curp;
2854         int                     *stat;          /* success/failure */
2855         int                     result;
2856         bool                    kswapd; /* allocation in kswapd context */
2857         struct completion       *done;
2858         struct work_struct      work;
2859 };
2860
2861 /*
2862  * Stack switching interfaces for allocation
2863  */
2864 static void
2865 xfs_btree_split_worker(
2866         struct work_struct      *work)
2867 {
2868         struct xfs_btree_split_args     *args = container_of(work,
2869                                                 struct xfs_btree_split_args, work);
2870         unsigned long           pflags;
2871         unsigned long           new_pflags = PF_FSTRANS;
2872
2873         /*
2874          * we are in a transaction context here, but may also be doing work
2875          * in kswapd context, and hence we may need to inherit that state
2876          * temporarily to ensure that we don't block waiting for memory reclaim
2877          * in any way.
2878          */
2879         if (args->kswapd)
2880                 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2881
2882         current_set_flags_nested(&pflags, new_pflags);
2883
2884         args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2885                                          args->key, args->curp, args->stat);
2886         complete(args->done);
2887
2888         current_restore_flags_nested(&pflags, new_pflags);
2889 }
2890
2891 /*
2892  * BMBT split requests often come in with little stack to work on. Push
2893  * them off to a worker thread so there is lots of stack to use. For the other
2894  * btree types, just call directly to avoid the context switch overhead here.
2895  */
2896 STATIC int                                      /* error */
2897 xfs_btree_split(
2898         struct xfs_btree_cur    *cur,
2899         int                     level,
2900         union xfs_btree_ptr     *ptrp,
2901         union xfs_btree_key     *key,
2902         struct xfs_btree_cur    **curp,
2903         int                     *stat)          /* success/failure */
2904 {
2905         struct xfs_btree_split_args     args;
2906         DECLARE_COMPLETION_ONSTACK(done);
2907
2908         if (cur->bc_btnum != XFS_BTNUM_BMAP)
2909                 return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2910
2911         args.cur = cur;
2912         args.level = level;
2913         args.ptrp = ptrp;
2914         args.key = key;
2915         args.curp = curp;
2916         args.stat = stat;
2917         args.done = &done;
2918         args.kswapd = current_is_kswapd();
2919         INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2920         queue_work(xfs_alloc_wq, &args.work);
2921         wait_for_completion(&done);
2922         destroy_work_on_stack(&args.work);
2923         return args.result;
2924 }
2925
2926
2927 /*
2928  * Copy the old inode root contents into a real block and make the
2929  * broot point to it.
2930  */
2931 int                                             /* error */
2932 xfs_btree_new_iroot(
2933         struct xfs_btree_cur    *cur,           /* btree cursor */
2934         int                     *logflags,      /* logging flags for inode */
2935         int                     *stat)          /* return status - 0 fail */
2936 {
2937         struct xfs_buf          *cbp;           /* buffer for cblock */
2938         struct xfs_btree_block  *block;         /* btree block */
2939         struct xfs_btree_block  *cblock;        /* child btree block */
2940         union xfs_btree_key     *ckp;           /* child key pointer */
2941         union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2942         union xfs_btree_key     *kp;            /* pointer to btree key */
2943         union xfs_btree_ptr     *pp;            /* pointer to block addr */
2944         union xfs_btree_ptr     nptr;           /* new block addr */
2945         int                     level;          /* btree level */
2946         int                     error;          /* error return code */
2947 #ifdef DEBUG
2948         int                     i;              /* loop counter */
2949 #endif
2950
2951         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2952         XFS_BTREE_STATS_INC(cur, newroot);
2953
2954         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2955
2956         level = cur->bc_nlevels - 1;
2957
2958         block = xfs_btree_get_iroot(cur);
2959         pp = xfs_btree_ptr_addr(cur, 1, block);
2960
2961         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2962         error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2963         if (error)
2964                 goto error0;
2965         if (*stat == 0) {
2966                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2967                 return 0;
2968         }
2969         XFS_BTREE_STATS_INC(cur, alloc);
2970
2971         /* Copy the root into a real block. */
2972         error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2973         if (error)
2974                 goto error0;
2975
2976         /*
2977          * we can't just memcpy() the root in for CRC enabled btree blocks.
2978          * In that case have to also ensure the blkno remains correct
2979          */
2980         memcpy(cblock, block, xfs_btree_block_len(cur));
2981         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2982                 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2983                         cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2984                 else
2985                         cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2986         }
2987
2988         be16_add_cpu(&block->bb_level, 1);
2989         xfs_btree_set_numrecs(block, 1);
2990         cur->bc_nlevels++;
2991         cur->bc_ptrs[level + 1] = 1;
2992
2993         kp = xfs_btree_key_addr(cur, 1, block);
2994         ckp = xfs_btree_key_addr(cur, 1, cblock);
2995         xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2996
2997         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2998 #ifdef DEBUG
2999         for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
3000                 error = xfs_btree_check_ptr(cur, pp, i, level);
3001                 if (error)
3002                         goto error0;
3003         }
3004 #endif
3005         xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
3006
3007 #ifdef DEBUG
3008         error = xfs_btree_check_ptr(cur, &nptr, 0, level);
3009         if (error)
3010                 goto error0;
3011 #endif
3012         xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
3013
3014         xfs_iroot_realloc(cur->bc_private.b.ip,
3015                           1 - xfs_btree_get_numrecs(cblock),
3016                           cur->bc_private.b.whichfork);
3017
3018         xfs_btree_setbuf(cur, level, cbp);
3019
3020         /*
3021          * Do all this logging at the end so that
3022          * the root is at the right level.
3023          */
3024         xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3025         xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3026         xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3027
3028         *logflags |=
3029                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
3030         *stat = 1;
3031         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3032         return 0;
3033 error0:
3034         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3035         return error;
3036 }
3037
3038 /*
3039  * Allocate a new root block, fill it in.
3040  */
3041 STATIC int                              /* error */
3042 xfs_btree_new_root(
3043         struct xfs_btree_cur    *cur,   /* btree cursor */
3044         int                     *stat)  /* success/failure */
3045 {
3046         struct xfs_btree_block  *block; /* one half of the old root block */
3047         struct xfs_buf          *bp;    /* buffer containing block */
3048         int                     error;  /* error return value */
3049         struct xfs_buf          *lbp;   /* left buffer pointer */
3050         struct xfs_btree_block  *left;  /* left btree block */
3051         struct xfs_buf          *nbp;   /* new (root) buffer */
3052         struct xfs_btree_block  *new;   /* new (root) btree block */
3053         int                     nptr;   /* new value for key index, 1 or 2 */
3054         struct xfs_buf          *rbp;   /* right buffer pointer */
3055         struct xfs_btree_block  *right; /* right btree block */
3056         union xfs_btree_ptr     rptr;
3057         union xfs_btree_ptr     lptr;
3058
3059         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3060         XFS_BTREE_STATS_INC(cur, newroot);
3061
3062         /* initialise our start point from the cursor */
3063         cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3064
3065         /* Allocate the new block. If we can't do it, we're toast. Give up. */
3066         error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3067         if (error)
3068                 goto error0;
3069         if (*stat == 0)
3070                 goto out0;
3071         XFS_BTREE_STATS_INC(cur, alloc);
3072
3073         /* Set up the new block. */
3074         error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
3075         if (error)
3076                 goto error0;
3077
3078         /* Set the root in the holding structure  increasing the level by 1. */
3079         cur->bc_ops->set_root(cur, &lptr, 1);
3080
3081         /*
3082          * At the previous root level there are now two blocks: the old root,
3083          * and the new block generated when it was split.  We don't know which
3084          * one the cursor is pointing at, so we set up variables "left" and
3085          * "right" for each case.
3086          */
3087         block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3088
3089 #ifdef DEBUG
3090         error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3091         if (error)
3092                 goto error0;
3093 #endif
3094
3095         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3096         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3097                 /* Our block is left, pick up the right block. */
3098                 lbp = bp;
3099                 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3100                 left = block;
3101                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3102                 if (error)
3103                         goto error0;
3104                 bp = rbp;
3105                 nptr = 1;
3106         } else {
3107                 /* Our block is right, pick up the left block. */
3108                 rbp = bp;
3109                 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3110                 right = block;
3111                 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3112                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3113                 if (error)
3114                         goto error0;
3115                 bp = lbp;
3116                 nptr = 2;
3117         }
3118
3119         /* Fill in the new block's btree header and log it. */
3120         xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3121         xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3122         ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3123                         !xfs_btree_ptr_is_null(cur, &rptr));
3124
3125         /* Fill in the key data in the new root. */
3126         if (xfs_btree_get_level(left) > 0) {
3127                 /*
3128                  * Get the keys for the left block's keys and put them directly
3129                  * in the parent block.  Do the same for the right block.
3130                  */
3131                 xfs_btree_get_node_keys(cur, left,
3132                                 xfs_btree_key_addr(cur, 1, new));
3133                 xfs_btree_get_node_keys(cur, right,
3134                                 xfs_btree_key_addr(cur, 2, new));
3135         } else {
3136                 /*
3137                  * Get the keys for the left block's records and put them
3138                  * directly in the parent block.  Do the same for the right
3139                  * block.
3140                  */
3141                 xfs_btree_get_leaf_keys(cur, left,
3142                         xfs_btree_key_addr(cur, 1, new));
3143                 xfs_btree_get_leaf_keys(cur, right,
3144                         xfs_btree_key_addr(cur, 2, new));
3145         }
3146         xfs_btree_log_keys(cur, nbp, 1, 2);
3147
3148         /* Fill in the pointer data in the new root. */
3149         xfs_btree_copy_ptrs(cur,
3150                 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3151         xfs_btree_copy_ptrs(cur,
3152                 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3153         xfs_btree_log_ptrs(cur, nbp, 1, 2);
3154
3155         /* Fix up the cursor. */
3156         xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3157         cur->bc_ptrs[cur->bc_nlevels] = nptr;
3158         cur->bc_nlevels++;
3159         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3160         *stat = 1;
3161         return 0;
3162 error0:
3163         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3164         return error;
3165 out0:
3166         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3167         *stat = 0;
3168         return 0;
3169 }
3170
3171 STATIC int
3172 xfs_btree_make_block_unfull(
3173         struct xfs_btree_cur    *cur,   /* btree cursor */
3174         int                     level,  /* btree level */
3175         int                     numrecs,/* # of recs in block */
3176         int                     *oindex,/* old tree index */
3177         int                     *index, /* new tree index */
3178         union xfs_btree_ptr     *nptr,  /* new btree ptr */
3179         struct xfs_btree_cur    **ncur, /* new btree cursor */
3180         union xfs_btree_key     *key,   /* key of new block */
3181         int                     *stat)
3182 {
3183         int                     error = 0;
3184
3185         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3186             level == cur->bc_nlevels - 1) {
3187                 struct xfs_inode *ip = cur->bc_private.b.ip;
3188
3189                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3190                         /* A root block that can be made bigger. */
3191                         xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
3192                         *stat = 1;
3193                 } else {
3194                         /* A root block that needs replacing */
3195                         int     logflags = 0;
3196
3197                         error = xfs_btree_new_iroot(cur, &logflags, stat);
3198                         if (error || *stat == 0)
3199                                 return error;
3200
3201                         xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3202                 }
3203
3204                 return 0;
3205         }
3206
3207         /* First, try shifting an entry to the right neighbor. */
3208         error = xfs_btree_rshift(cur, level, stat);
3209         if (error || *stat)
3210                 return error;
3211
3212         /* Next, try shifting an entry to the left neighbor. */
3213         error = xfs_btree_lshift(cur, level, stat);
3214         if (error)
3215                 return error;
3216
3217         if (*stat) {
3218                 *oindex = *index = cur->bc_ptrs[level];
3219                 return 0;
3220         }
3221
3222         /*
3223          * Next, try splitting the current block in half.
3224          *
3225          * If this works we have to re-set our variables because we
3226          * could be in a different block now.
3227          */
3228         error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3229         if (error || *stat == 0)
3230                 return error;
3231
3232
3233         *index = cur->bc_ptrs[level];
3234         return 0;
3235 }
3236
3237 /*
3238  * Insert one record/level.  Return information to the caller
3239  * allowing the next level up to proceed if necessary.
3240  */
3241 STATIC int
3242 xfs_btree_insrec(
3243         struct xfs_btree_cur    *cur,   /* btree cursor */
3244         int                     level,  /* level to insert record at */
3245         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3246         union xfs_btree_rec     *rec,   /* record to insert */
3247         union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3248         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3249         int                     *stat)  /* success/failure */
3250 {
3251         struct xfs_btree_block  *block; /* btree block */
3252         struct xfs_buf          *bp;    /* buffer for block */
3253         union xfs_btree_ptr     nptr;   /* new block ptr */
3254         struct xfs_btree_cur    *ncur;  /* new btree cursor */
3255         union xfs_btree_key     nkey;   /* new block key */
3256         union xfs_btree_key     *lkey;
3257         int                     optr;   /* old key/record index */
3258         int                     ptr;    /* key/record index */
3259         int                     numrecs;/* number of records */
3260         int                     error;  /* error return value */
3261 #ifdef DEBUG
3262         int                     i;
3263 #endif
3264         xfs_daddr_t             old_bn;
3265
3266         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3267         XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
3268
3269         ncur = NULL;
3270         lkey = &nkey;
3271
3272         /*
3273          * If we have an external root pointer, and we've made it to the
3274          * root level, allocate a new root block and we're done.
3275          */
3276         if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3277             (level >= cur->bc_nlevels)) {
3278                 error = xfs_btree_new_root(cur, stat);
3279                 xfs_btree_set_ptr_null(cur, ptrp);
3280
3281                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3282                 return error;
3283         }
3284
3285         /* If we're off the left edge, return failure. */
3286         ptr = cur->bc_ptrs[level];
3287         if (ptr == 0) {
3288                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3289                 *stat = 0;
3290                 return 0;
3291         }
3292
3293         optr = ptr;
3294
3295         XFS_BTREE_STATS_INC(cur, insrec);
3296
3297         /* Get pointers to the btree buffer and block. */
3298         block = xfs_btree_get_block(cur, level, &bp);
3299         old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3300         numrecs = xfs_btree_get_numrecs(block);
3301
3302 #ifdef DEBUG
3303         error = xfs_btree_check_block(cur, block, level, bp);
3304         if (error)
3305                 goto error0;
3306
3307         /* Check that the new entry is being inserted in the right place. */
3308         if (ptr <= numrecs) {
3309                 if (level == 0) {
3310                         ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3311                                 xfs_btree_rec_addr(cur, ptr, block)));
3312                 } else {
3313                         ASSERT(cur->bc_ops->keys_inorder(cur, key,
3314                                 xfs_btree_key_addr(cur, ptr, block)));
3315                 }
3316         }
3317 #endif
3318
3319         /*
3320          * If the block is full, we can't insert the new entry until we
3321          * make the block un-full.
3322          */
3323         xfs_btree_set_ptr_null(cur, &nptr);
3324         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3325                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
3326                                         &optr, &ptr, &nptr, &ncur, lkey, stat);
3327                 if (error || *stat == 0)
3328                         goto error0;
3329         }
3330
3331         /*
3332          * The current block may have changed if the block was
3333          * previously full and we have just made space in it.
3334          */
3335         block = xfs_btree_get_block(cur, level, &bp);
3336         numrecs = xfs_btree_get_numrecs(block);
3337
3338 #ifdef DEBUG
3339         error = xfs_btree_check_block(cur, block, level, bp);
3340         if (error)
3341                 return error;
3342 #endif
3343
3344         /*
3345          * At this point we know there's room for our new entry in the block
3346          * we're pointing at.
3347          */
3348         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3349
3350         if (level > 0) {
3351                 /* It's a nonleaf. make a hole in the keys and ptrs */
3352                 union xfs_btree_key     *kp;
3353                 union xfs_btree_ptr     *pp;
3354
3355                 kp = xfs_btree_key_addr(cur, ptr, block);
3356                 pp = xfs_btree_ptr_addr(cur, ptr, block);
3357
3358 #ifdef DEBUG
3359                 for (i = numrecs - ptr; i >= 0; i--) {
3360                         error = xfs_btree_check_ptr(cur, pp, i, level);
3361                         if (error)
3362                                 return error;
3363                 }
3364 #endif
3365
3366                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3367                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3368
3369 #ifdef DEBUG
3370                 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3371                 if (error)
3372                         goto error0;
3373 #endif
3374
3375                 /* Now put the new data in, bump numrecs and log it. */
3376                 xfs_btree_copy_keys(cur, kp, key, 1);
3377                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3378                 numrecs++;
3379                 xfs_btree_set_numrecs(block, numrecs);
3380                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3381                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
3382 #ifdef DEBUG
3383                 if (ptr < numrecs) {
3384                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3385                                 xfs_btree_key_addr(cur, ptr + 1, block)));
3386                 }
3387 #endif
3388         } else {
3389                 /* It's a leaf. make a hole in the records */
3390                 union xfs_btree_rec             *rp;
3391
3392                 rp = xfs_btree_rec_addr(cur, ptr, block);
3393
3394                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3395
3396                 /* Now put the new data in, bump numrecs and log it. */
3397                 xfs_btree_copy_recs(cur, rp, rec, 1);
3398                 xfs_btree_set_numrecs(block, ++numrecs);
3399                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
3400 #ifdef DEBUG
3401                 if (ptr < numrecs) {
3402                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3403                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
3404                 }
3405 #endif
3406         }
3407
3408         /* Log the new number of records in the btree header. */
3409         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3410
3411         /*
3412          * If we just inserted into a new tree block, we have to
3413          * recalculate nkey here because nkey is out of date.
3414          *
3415          * Otherwise we're just updating an existing block (having shoved
3416          * some records into the new tree block), so use the regular key
3417          * update mechanism.
3418          */
3419         if (bp && bp->b_bn != old_bn) {
3420                 xfs_btree_get_keys(cur, block, lkey);
3421         } else if (xfs_btree_needs_key_update(cur, optr)) {
3422                 error = xfs_btree_update_keys(cur, level);
3423                 if (error)
3424                         goto error0;
3425         }
3426
3427         /*
3428          * If we are tracking the last record in the tree and
3429          * we are at the far right edge of the tree, update it.
3430          */
3431         if (xfs_btree_is_lastrec(cur, block, level)) {
3432                 cur->bc_ops->update_lastrec(cur, block, rec,
3433                                             ptr, LASTREC_INSREC);
3434         }
3435
3436         /*
3437          * Return the new block number, if any.
3438          * If there is one, give back a record value and a cursor too.
3439          */
3440         *ptrp = nptr;
3441         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3442                 xfs_btree_copy_keys(cur, key, lkey, 1);
3443                 *curp = ncur;
3444         }
3445
3446         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3447         *stat = 1;
3448         return 0;
3449
3450 error0:
3451         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3452         return error;
3453 }
3454
3455 /*
3456  * Insert the record at the point referenced by cur.
3457  *
3458  * A multi-level split of the tree on insert will invalidate the original
3459  * cursor.  All callers of this function should assume that the cursor is
3460  * no longer valid and revalidate it.
3461  */
3462 int
3463 xfs_btree_insert(
3464         struct xfs_btree_cur    *cur,
3465         int                     *stat)
3466 {
3467         int                     error;  /* error return value */
3468         int                     i;      /* result value, 0 for failure */
3469         int                     level;  /* current level number in btree */
3470         union xfs_btree_ptr     nptr;   /* new block number (split result) */
3471         struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3472         struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3473         union xfs_btree_key     bkey;   /* key of block to insert */
3474         union xfs_btree_key     *key;
3475         union xfs_btree_rec     rec;    /* record to insert */
3476
3477         level = 0;
3478         ncur = NULL;
3479         pcur = cur;
3480         key = &bkey;
3481
3482         xfs_btree_set_ptr_null(cur, &nptr);
3483
3484         /* Make a key out of the record data to be inserted, and save it. */
3485         cur->bc_ops->init_rec_from_cur(cur, &rec);
3486         cur->bc_ops->init_key_from_rec(key, &rec);
3487
3488         /*
3489          * Loop going up the tree, starting at the leaf level.
3490          * Stop when we don't get a split block, that must mean that
3491          * the insert is finished with this level.
3492          */
3493         do {
3494                 /*
3495                  * Insert nrec/nptr into this level of the tree.
3496                  * Note if we fail, nptr will be null.
3497                  */
3498                 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3499                                 &ncur, &i);
3500                 if (error) {
3501                         if (pcur != cur)
3502                                 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3503                         goto error0;
3504                 }
3505
3506                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3507                 level++;
3508
3509                 /*
3510                  * See if the cursor we just used is trash.
3511                  * Can't trash the caller's cursor, but otherwise we should
3512                  * if ncur is a new cursor or we're about to be done.
3513                  */
3514                 if (pcur != cur &&
3515                     (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3516                         /* Save the state from the cursor before we trash it */
3517                         if (cur->bc_ops->update_cursor)
3518                                 cur->bc_ops->update_cursor(pcur, cur);
3519                         cur->bc_nlevels = pcur->bc_nlevels;
3520                         xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3521                 }
3522                 /* If we got a new cursor, switch to it. */
3523                 if (ncur) {
3524                         pcur = ncur;
3525                         ncur = NULL;
3526                 }
3527         } while (!xfs_btree_ptr_is_null(cur, &nptr));
3528
3529         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3530         *stat = i;
3531         return 0;
3532 error0:
3533         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3534         return error;
3535 }
3536
3537 /*
3538  * Try to merge a non-leaf block back into the inode root.
3539  *
3540  * Note: the killroot names comes from the fact that we're effectively
3541  * killing the old root block.  But because we can't just delete the
3542  * inode we have to copy the single block it was pointing to into the
3543  * inode.
3544  */
3545 STATIC int
3546 xfs_btree_kill_iroot(
3547         struct xfs_btree_cur    *cur)
3548 {
3549         int                     whichfork = cur->bc_private.b.whichfork;
3550         struct xfs_inode        *ip = cur->bc_private.b.ip;
3551         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3552         struct xfs_btree_block  *block;
3553         struct xfs_btree_block  *cblock;
3554         union xfs_btree_key     *kp;
3555         union xfs_btree_key     *ckp;
3556         union xfs_btree_ptr     *pp;
3557         union xfs_btree_ptr     *cpp;
3558         struct xfs_buf          *cbp;
3559         int                     level;
3560         int                     index;
3561         int                     numrecs;
3562         int                     error;
3563 #ifdef DEBUG
3564         union xfs_btree_ptr     ptr;
3565         int                     i;
3566 #endif
3567
3568         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3569
3570         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3571         ASSERT(cur->bc_nlevels > 1);
3572
3573         /*
3574          * Don't deal with the root block needs to be a leaf case.
3575          * We're just going to turn the thing back into extents anyway.
3576          */
3577         level = cur->bc_nlevels - 1;
3578         if (level == 1)
3579                 goto out0;
3580
3581         /*
3582          * Give up if the root has multiple children.
3583          */
3584         block = xfs_btree_get_iroot(cur);
3585         if (xfs_btree_get_numrecs(block) != 1)
3586                 goto out0;
3587
3588         cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3589         numrecs = xfs_btree_get_numrecs(cblock);
3590
3591         /*
3592          * Only do this if the next level will fit.
3593          * Then the data must be copied up to the inode,
3594          * instead of freeing the root you free the next level.
3595          */
3596         if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3597                 goto out0;
3598
3599         XFS_BTREE_STATS_INC(cur, killroot);
3600
3601 #ifdef DEBUG
3602         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3603         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3604         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3605         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3606 #endif
3607
3608         index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3609         if (index) {
3610                 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3611                                   cur->bc_private.b.whichfork);
3612                 block = ifp->if_broot;
3613         }
3614
3615         be16_add_cpu(&block->bb_numrecs, index);
3616         ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3617
3618         kp = xfs_btree_key_addr(cur, 1, block);
3619         ckp = xfs_btree_key_addr(cur, 1, cblock);
3620         xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3621
3622         pp = xfs_btree_ptr_addr(cur, 1, block);
3623         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3624 #ifdef DEBUG
3625         for (i = 0; i < numrecs; i++) {
3626                 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3627                 if (error) {
3628                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3629                         return error;
3630                 }
3631         }
3632 #endif
3633         xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3634
3635         error = xfs_btree_free_block(cur, cbp);
3636         if (error) {
3637                 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3638                 return error;
3639         }
3640
3641         cur->bc_bufs[level - 1] = NULL;
3642         be16_add_cpu(&block->bb_level, -1);
3643         xfs_trans_log_inode(cur->bc_tp, ip,
3644                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3645         cur->bc_nlevels--;
3646 out0:
3647         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3648         return 0;
3649 }
3650
3651 /*
3652  * Kill the current root node, and replace it with it's only child node.
3653  */
3654 STATIC int
3655 xfs_btree_kill_root(
3656         struct xfs_btree_cur    *cur,
3657         struct xfs_buf          *bp,
3658         int                     level,
3659         union xfs_btree_ptr     *newroot)
3660 {
3661         int                     error;
3662
3663         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3664         XFS_BTREE_STATS_INC(cur, killroot);
3665
3666         /*
3667          * Update the root pointer, decreasing the level by 1 and then
3668          * free the old root.
3669          */
3670         cur->bc_ops->set_root(cur, newroot, -1);
3671
3672         error = xfs_btree_free_block(cur, bp);
3673         if (error) {
3674                 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3675                 return error;
3676         }
3677
3678         cur->bc_bufs[level] = NULL;
3679         cur->bc_ra[level] = 0;
3680         cur->bc_nlevels--;
3681
3682         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3683         return 0;
3684 }
3685
3686 STATIC int
3687 xfs_btree_dec_cursor(
3688         struct xfs_btree_cur    *cur,
3689         int                     level,
3690         int                     *stat)
3691 {
3692         int                     error;
3693         int                     i;
3694
3695         if (level > 0) {
3696                 error = xfs_btree_decrement(cur, level, &i);
3697                 if (error)
3698                         return error;
3699         }
3700
3701         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3702         *stat = 1;
3703         return 0;
3704 }
3705
3706 /*
3707  * Single level of the btree record deletion routine.
3708  * Delete record pointed to by cur/level.
3709  * Remove the record from its block then rebalance the tree.
3710  * Return 0 for error, 1 for done, 2 to go on to the next level.
3711  */
3712 STATIC int                                      /* error */
3713 xfs_btree_delrec(
3714         struct xfs_btree_cur    *cur,           /* btree cursor */
3715         int                     level,          /* level removing record from */
3716         int                     *stat)          /* fail/done/go-on */
3717 {
3718         struct xfs_btree_block  *block;         /* btree block */
3719         union xfs_btree_ptr     cptr;           /* current block ptr */
3720         struct xfs_buf          *bp;            /* buffer for block */
3721         int                     error;          /* error return value */
3722         int                     i;              /* loop counter */
3723         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3724         struct xfs_buf          *lbp;           /* left buffer pointer */
3725         struct xfs_btree_block  *left;          /* left btree block */
3726         int                     lrecs = 0;      /* left record count */
3727         int                     ptr;            /* key/record index */
3728         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3729         struct xfs_buf          *rbp;           /* right buffer pointer */
3730         struct xfs_btree_block  *right;         /* right btree block */
3731         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3732         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3733         int                     rrecs = 0;      /* right record count */
3734         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3735         int                     numrecs;        /* temporary numrec count */
3736
3737         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3738         XFS_BTREE_TRACE_ARGI(cur, level);
3739
3740         tcur = NULL;
3741
3742         /* Get the index of the entry being deleted, check for nothing there. */
3743         ptr = cur->bc_ptrs[level];
3744         if (ptr == 0) {
3745                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3746                 *stat = 0;
3747                 return 0;
3748         }
3749
3750         /* Get the buffer & block containing the record or key/ptr. */
3751         block = xfs_btree_get_block(cur, level, &bp);
3752         numrecs = xfs_btree_get_numrecs(block);
3753
3754 #ifdef DEBUG
3755         error = xfs_btree_check_block(cur, block, level, bp);
3756         if (error)
3757                 goto error0;
3758 #endif
3759
3760         /* Fail if we're off the end of the block. */
3761         if (ptr > numrecs) {
3762                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3763                 *stat = 0;
3764                 return 0;
3765         }
3766
3767         XFS_BTREE_STATS_INC(cur, delrec);
3768         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3769
3770         /* Excise the entries being deleted. */
3771         if (level > 0) {
3772                 /* It's a nonleaf. operate on keys and ptrs */
3773                 union xfs_btree_key     *lkp;
3774                 union xfs_btree_ptr     *lpp;
3775
3776                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3777                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3778
3779 #ifdef DEBUG
3780                 for (i = 0; i < numrecs - ptr; i++) {
3781                         error = xfs_btree_check_ptr(cur, lpp, i, level);
3782                         if (error)
3783                                 goto error0;
3784                 }
3785 #endif
3786
3787                 if (ptr < numrecs) {
3788                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3789                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3790                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3791                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3792                 }
3793         } else {
3794                 /* It's a leaf. operate on records */
3795                 if (ptr < numrecs) {
3796                         xfs_btree_shift_recs(cur,
3797                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3798                                 -1, numrecs - ptr);
3799                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3800                 }
3801         }
3802
3803         /*
3804          * Decrement and log the number of entries in the block.
3805          */
3806         xfs_btree_set_numrecs(block, --numrecs);
3807         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3808
3809         /*
3810          * If we are tracking the last record in the tree and
3811          * we are at the far right edge of the tree, update it.
3812          */
3813         if (xfs_btree_is_lastrec(cur, block, level)) {
3814                 cur->bc_ops->update_lastrec(cur, block, NULL,
3815                                             ptr, LASTREC_DELREC);
3816         }
3817
3818         /*
3819          * We're at the root level.  First, shrink the root block in-memory.
3820          * Try to get rid of the next level down.  If we can't then there's
3821          * nothing left to do.
3822          */
3823         if (level == cur->bc_nlevels - 1) {
3824                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3825                         xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3826                                           cur->bc_private.b.whichfork);
3827
3828                         error = xfs_btree_kill_iroot(cur);
3829                         if (error)
3830                                 goto error0;
3831
3832                         error = xfs_btree_dec_cursor(cur, level, stat);
3833                         if (error)
3834                                 goto error0;
3835                         *stat = 1;
3836                         return 0;
3837                 }
3838
3839                 /*
3840                  * If this is the root level, and there's only one entry left,
3841                  * and it's NOT the leaf level, then we can get rid of this
3842                  * level.
3843                  */
3844                 if (numrecs == 1 && level > 0) {
3845                         union xfs_btree_ptr     *pp;
3846                         /*
3847                          * pp is still set to the first pointer in the block.
3848                          * Make it the new root of the btree.
3849                          */
3850                         pp = xfs_btree_ptr_addr(cur, 1, block);
3851                         error = xfs_btree_kill_root(cur, bp, level, pp);
3852                         if (error)
3853                                 goto error0;
3854                 } else if (level > 0) {
3855                         error = xfs_btree_dec_cursor(cur, level, stat);
3856                         if (error)
3857                                 goto error0;
3858                 }
3859                 *stat = 1;
3860                 return 0;
3861         }
3862
3863         /*
3864          * If we deleted the leftmost entry in the block, update the
3865          * key values above us in the tree.
3866          */
3867         if (xfs_btree_needs_key_update(cur, ptr)) {
3868                 error = xfs_btree_update_keys(cur, level);
3869                 if (error)
3870                         goto error0;
3871         }
3872
3873         /*
3874          * If the number of records remaining in the block is at least
3875          * the minimum, we're done.
3876          */
3877         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3878                 error = xfs_btree_dec_cursor(cur, level, stat);
3879                 if (error)
3880                         goto error0;
3881                 return 0;
3882         }
3883
3884         /*
3885          * Otherwise, we have to move some records around to keep the
3886          * tree balanced.  Look at the left and right sibling blocks to
3887          * see if we can re-balance by moving only one record.
3888          */
3889         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3890         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3891
3892         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3893                 /*
3894                  * One child of root, need to get a chance to copy its contents
3895                  * into the root and delete it. Can't go up to next level,
3896                  * there's nothing to delete there.
3897                  */
3898                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3899                     xfs_btree_ptr_is_null(cur, &lptr) &&
3900                     level == cur->bc_nlevels - 2) {
3901                         error = xfs_btree_kill_iroot(cur);
3902                         if (!error)
3903                                 error = xfs_btree_dec_cursor(cur, level, stat);
3904                         if (error)
3905                                 goto error0;
3906                         return 0;
3907                 }
3908         }
3909
3910         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3911                !xfs_btree_ptr_is_null(cur, &lptr));
3912
3913         /*
3914          * Duplicate the cursor so our btree manipulations here won't
3915          * disrupt the next level up.
3916          */
3917         error = xfs_btree_dup_cursor(cur, &tcur);
3918         if (error)
3919                 goto error0;
3920
3921         /*
3922          * If there's a right sibling, see if it's ok to shift an entry
3923          * out of it.
3924          */
3925         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3926                 /*
3927                  * Move the temp cursor to the last entry in the next block.
3928                  * Actually any entry but the first would suffice.
3929                  */
3930                 i = xfs_btree_lastrec(tcur, level);
3931                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3932
3933                 error = xfs_btree_increment(tcur, level, &i);
3934                 if (error)
3935                         goto error0;
3936                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3937
3938                 i = xfs_btree_lastrec(tcur, level);
3939                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3940
3941                 /* Grab a pointer to the block. */
3942                 right = xfs_btree_get_block(tcur, level, &rbp);
3943 #ifdef DEBUG
3944                 error = xfs_btree_check_block(tcur, right, level, rbp);
3945                 if (error)
3946                         goto error0;
3947 #endif
3948                 /* Grab the current block number, for future use. */
3949                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3950
3951                 /*
3952                  * If right block is full enough so that removing one entry
3953                  * won't make it too empty, and left-shifting an entry out
3954                  * of right to us works, we're done.
3955                  */
3956                 if (xfs_btree_get_numrecs(right) - 1 >=
3957                     cur->bc_ops->get_minrecs(tcur, level)) {
3958                         error = xfs_btree_lshift(tcur, level, &i);
3959                         if (error)
3960                                 goto error0;
3961                         if (i) {
3962                                 ASSERT(xfs_btree_get_numrecs(block) >=
3963                                        cur->bc_ops->get_minrecs(tcur, level));
3964
3965                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3966                                 tcur = NULL;
3967
3968                                 error = xfs_btree_dec_cursor(cur, level, stat);
3969                                 if (error)
3970                                         goto error0;
3971                                 return 0;
3972                         }
3973                 }
3974
3975                 /*
3976                  * Otherwise, grab the number of records in right for
3977                  * future reference, and fix up the temp cursor to point
3978                  * to our block again (last record).
3979                  */
3980                 rrecs = xfs_btree_get_numrecs(right);
3981                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3982                         i = xfs_btree_firstrec(tcur, level);
3983                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3984
3985                         error = xfs_btree_decrement(tcur, level, &i);
3986                         if (error)
3987                                 goto error0;
3988                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3989                 }
3990         }
3991
3992         /*
3993          * If there's a left sibling, see if it's ok to shift an entry
3994          * out of it.
3995          */
3996         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3997                 /*
3998                  * Move the temp cursor to the first entry in the
3999                  * previous block.
4000                  */
4001                 i = xfs_btree_firstrec(tcur, level);
4002                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
4003
4004                 error = xfs_btree_decrement(tcur, level, &i);
4005                 if (error)
4006                         goto error0;
4007                 i = xfs_btree_firstrec(tcur, level);
4008                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
4009
4010                 /* Grab a pointer to the block. */
4011                 left = xfs_btree_get_block(tcur, level, &lbp);
4012 #ifdef DEBUG
4013                 error = xfs_btree_check_block(cur, left, level, lbp);
4014                 if (error)
4015                         goto error0;
4016 #endif
4017                 /* Grab the current block number, for future use. */
4018                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
4019
4020                 /*
4021                  * If left block is full enough so that removing one entry
4022                  * won't make it too empty, and right-shifting an entry out
4023                  * of left to us works, we're done.
4024                  */
4025                 if (xfs_btree_get_numrecs(left) - 1 >=
4026                     cur->bc_ops->get_minrecs(tcur, level)) {
4027                         error = xfs_btree_rshift(tcur, level, &i);
4028                         if (error)
4029                                 goto error0;
4030                         if (i) {
4031                                 ASSERT(xfs_btree_get_numrecs(block) >=
4032                                        cur->bc_ops->get_minrecs(tcur, level));
4033                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4034                                 tcur = NULL;
4035                                 if (level == 0)
4036                                         cur->bc_ptrs[0]++;
4037                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
4038                                 *stat = 1;
4039                                 return 0;
4040                         }
4041                 }
4042
4043                 /*
4044                  * Otherwise, grab the number of records in right for
4045                  * future reference.
4046                  */
4047                 lrecs = xfs_btree_get_numrecs(left);
4048         }
4049
4050         /* Delete the temp cursor, we're done with it. */
4051         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4052         tcur = NULL;
4053
4054         /* If here, we need to do a join to keep the tree balanced. */
4055         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
4056
4057         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
4058             lrecs + xfs_btree_get_numrecs(block) <=
4059                         cur->bc_ops->get_maxrecs(cur, level)) {
4060                 /*
4061                  * Set "right" to be the starting block,
4062                  * "left" to be the left neighbor.
4063                  */
4064                 rptr = cptr;
4065                 right = block;
4066                 rbp = bp;
4067                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4068                 if (error)
4069                         goto error0;
4070
4071         /*
4072          * If that won't work, see if we can join with the right neighbor block.
4073          */
4074         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4075                    rrecs + xfs_btree_get_numrecs(block) <=
4076                         cur->bc_ops->get_maxrecs(cur, level)) {
4077                 /*
4078                  * Set "left" to be the starting block,
4079                  * "right" to be the right neighbor.
4080                  */
4081                 lptr = cptr;
4082                 left = block;
4083                 lbp = bp;
4084                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4085                 if (error)
4086                         goto error0;
4087
4088         /*
4089          * Otherwise, we can't fix the imbalance.
4090          * Just return.  This is probably a logic error, but it's not fatal.
4091          */
4092         } else {
4093                 error = xfs_btree_dec_cursor(cur, level, stat);
4094                 if (error)
4095                         goto error0;
4096                 return 0;
4097         }
4098
4099         rrecs = xfs_btree_get_numrecs(right);
4100         lrecs = xfs_btree_get_numrecs(left);
4101
4102         /*
4103          * We're now going to join "left" and "right" by moving all the stuff
4104          * in "right" to "left" and deleting "right".
4105          */
4106         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4107         if (level > 0) {
4108                 /* It's a non-leaf.  Move keys and pointers. */
4109                 union xfs_btree_key     *lkp;   /* left btree key */
4110                 union xfs_btree_ptr     *lpp;   /* left address pointer */
4111                 union xfs_btree_key     *rkp;   /* right btree key */
4112                 union xfs_btree_ptr     *rpp;   /* right address pointer */
4113
4114                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4115                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4116                 rkp = xfs_btree_key_addr(cur, 1, right);
4117                 rpp = xfs_btree_ptr_addr(cur, 1, right);
4118 #ifdef DEBUG
4119                 for (i = 1; i < rrecs; i++) {
4120                         error = xfs_btree_check_ptr(cur, rpp, i, level);
4121                         if (error)
4122                                 goto error0;
4123                 }
4124 #endif
4125                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4126                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4127
4128                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4129                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4130         } else {
4131                 /* It's a leaf.  Move records.  */
4132                 union xfs_btree_rec     *lrp;   /* left record pointer */
4133                 union xfs_btree_rec     *rrp;   /* right record pointer */
4134
4135                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4136                 rrp = xfs_btree_rec_addr(cur, 1, right);
4137
4138                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4139                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4140         }
4141
4142         XFS_BTREE_STATS_INC(cur, join);
4143
4144         /*
4145          * Fix up the number of records and right block pointer in the
4146          * surviving block, and log it.
4147          */
4148         xfs_btree_set_numrecs(left, lrecs + rrecs);
4149         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
4150         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4151         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4152
4153         /* If there is a right sibling, point it to the remaining block. */
4154         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4155         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4156                 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4157                 if (error)
4158                         goto error0;
4159                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4160                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4161         }
4162
4163         /* Free the deleted block. */
4164         error = xfs_btree_free_block(cur, rbp);
4165         if (error)
4166                 goto error0;
4167
4168         /*
4169          * If we joined with the left neighbor, set the buffer in the
4170          * cursor to the left block, and fix up the index.
4171          */
4172         if (bp != lbp) {
4173                 cur->bc_bufs[level] = lbp;
4174                 cur->bc_ptrs[level] += lrecs;
4175                 cur->bc_ra[level] = 0;
4176         }
4177         /*
4178          * If we joined with the right neighbor and there's a level above
4179          * us, increment the cursor at that level.
4180          */
4181         else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4182                    (level + 1 < cur->bc_nlevels)) {
4183                 error = xfs_btree_increment(cur, level + 1, &i);
4184                 if (error)
4185                         goto error0;
4186         }
4187
4188         /*
4189          * Readjust the ptr at this level if it's not a leaf, since it's
4190          * still pointing at the deletion point, which makes the cursor
4191          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4192          * We can't use decrement because it would change the next level up.
4193          */
4194         if (level > 0)
4195                 cur->bc_ptrs[level]--;
4196
4197         /*
4198          * We combined blocks, so we have to update the parent keys if the
4199          * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4200          * points to the old block so that the caller knows which record to
4201          * delete.  Therefore, the caller must be savvy enough to call updkeys
4202          * for us if we return stat == 2.  The other exit points from this
4203          * function don't require deletions further up the tree, so they can
4204          * call updkeys directly.
4205          */
4206
4207         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
4208         /* Return value means the next level up has something to do. */
4209         *stat = 2;
4210         return 0;
4211
4212 error0:
4213         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
4214         if (tcur)
4215                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4216         return error;
4217 }
4218
4219 /*
4220  * Delete the record pointed to by cur.
4221  * The cursor refers to the place where the record was (could be inserted)
4222  * when the operation returns.
4223  */
4224 int                                     /* error */
4225 xfs_btree_delete(
4226         struct xfs_btree_cur    *cur,
4227         int                     *stat)  /* success/failure */
4228 {
4229         int                     error;  /* error return value */
4230         int                     level;
4231         int                     i;
4232         bool                    joined = false;
4233
4234         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
4235
4236         /*
4237          * Go up the tree, starting at leaf level.
4238          *
4239          * If 2 is returned then a join was done; go to the next level.
4240          * Otherwise we are done.
4241          */
4242         for (level = 0, i = 2; i == 2; level++) {
4243                 error = xfs_btree_delrec(cur, level, &i);
4244                 if (error)
4245                         goto error0;
4246                 if (i == 2)
4247                         joined = true;
4248         }
4249
4250         /*
4251          * If we combined blocks as part of deleting the record, delrec won't
4252          * have updated the parent high keys so we have to do that here.
4253          */
4254         if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4255                 error = xfs_btree_updkeys_force(cur, 0);
4256                 if (error)
4257                         goto error0;
4258         }
4259
4260         if (i == 0) {
4261                 for (level = 1; level < cur->bc_nlevels; level++) {
4262                         if (cur->bc_ptrs[level] == 0) {
4263                                 error = xfs_btree_decrement(cur, level, &i);
4264                                 if (error)
4265                                         goto error0;
4266                                 break;
4267                         }
4268                 }
4269         }
4270
4271         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
4272         *stat = i;
4273         return 0;
4274 error0:
4275         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
4276         return error;
4277 }
4278
4279 /*
4280  * Get the data from the pointed-to record.
4281  */
4282 int                                     /* error */
4283 xfs_btree_get_rec(
4284         struct xfs_btree_cur    *cur,   /* btree cursor */
4285         union xfs_btree_rec     **recp, /* output: btree record */
4286         int                     *stat)  /* output: success/failure */
4287 {
4288         struct xfs_btree_block  *block; /* btree block */
4289         struct xfs_buf          *bp;    /* buffer pointer */
4290         int                     ptr;    /* record number */
4291 #ifdef DEBUG
4292         int                     error;  /* error return value */
4293 #endif
4294
4295         ptr = cur->bc_ptrs[0];
4296         block = xfs_btree_get_block(cur, 0, &bp);
4297
4298 #ifdef DEBUG
4299         error = xfs_btree_check_block(cur, block, 0, bp);
4300         if (error)
4301                 return error;
4302 #endif
4303
4304         /*
4305          * Off the right end or left end, return failure.
4306          */
4307         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4308                 *stat = 0;
4309                 return 0;
4310         }
4311
4312         /*
4313          * Point to the record and extract its data.
4314          */
4315         *recp = xfs_btree_rec_addr(cur, ptr, block);
4316         *stat = 1;
4317         return 0;
4318 }
4319
4320 /* Visit a block in a btree. */
4321 STATIC int
4322 xfs_btree_visit_block(
4323         struct xfs_btree_cur            *cur,
4324         int                             level,
4325         xfs_btree_visit_blocks_fn       fn,
4326         void                            *data)
4327 {
4328         struct xfs_btree_block          *block;
4329         struct xfs_buf                  *bp;
4330         union xfs_btree_ptr             rptr;
4331         int                             error;
4332
4333         /* do right sibling readahead */
4334         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4335         block = xfs_btree_get_block(cur, level, &bp);
4336
4337         /* process the block */
4338         error = fn(cur, level, data);
4339         if (error)
4340                 return error;
4341
4342         /* now read rh sibling block for next iteration */
4343         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4344         if (xfs_btree_ptr_is_null(cur, &rptr))
4345                 return -ENOENT;
4346
4347         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4348 }
4349
4350
4351 /* Visit every block in a btree. */
4352 int
4353 xfs_btree_visit_blocks(
4354         struct xfs_btree_cur            *cur,
4355         xfs_btree_visit_blocks_fn       fn,
4356         void                            *data)
4357 {
4358         union xfs_btree_ptr             lptr;
4359         int                             level;
4360         struct xfs_btree_block          *block = NULL;
4361         int                             error = 0;
4362
4363         cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4364
4365         /* for each level */
4366         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4367                 /* grab the left hand block */
4368                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4369                 if (error)
4370                         return error;
4371
4372                 /* readahead the left most block for the next level down */
4373                 if (level > 0) {
4374                         union xfs_btree_ptr     *ptr;
4375
4376                         ptr = xfs_btree_ptr_addr(cur, 1, block);
4377                         xfs_btree_readahead_ptr(cur, ptr, 1);
4378
4379                         /* save for the next iteration of the loop */
4380                         lptr = *ptr;
4381                 }
4382
4383                 /* for each buffer in the level */
4384                 do {
4385                         error = xfs_btree_visit_block(cur, level, fn, data);
4386                 } while (!error);
4387
4388                 if (error != -ENOENT)
4389                         return error;
4390         }
4391
4392         return 0;
4393 }
4394
4395 /*
4396  * Change the owner of a btree.
4397  *
4398  * The mechanism we use here is ordered buffer logging. Because we don't know
4399  * how many buffers were are going to need to modify, we don't really want to
4400  * have to make transaction reservations for the worst case of every buffer in a
4401  * full size btree as that may be more space that we can fit in the log....
4402  *
4403  * We do the btree walk in the most optimal manner possible - we have sibling
4404  * pointers so we can just walk all the blocks on each level from left to right
4405  * in a single pass, and then move to the next level and do the same. We can
4406  * also do readahead on the sibling pointers to get IO moving more quickly,
4407  * though for slow disks this is unlikely to make much difference to performance
4408  * as the amount of CPU work we have to do before moving to the next block is
4409  * relatively small.
4410  *
4411  * For each btree block that we load, modify the owner appropriately, set the
4412  * buffer as an ordered buffer and log it appropriately. We need to ensure that
4413  * we mark the region we change dirty so that if the buffer is relogged in
4414  * a subsequent transaction the changes we make here as an ordered buffer are
4415  * correctly relogged in that transaction.  If we are in recovery context, then
4416  * just queue the modified buffer as delayed write buffer so the transaction
4417  * recovery completion writes the changes to disk.
4418  */
4419 struct xfs_btree_block_change_owner_info {
4420         __uint64_t              new_owner;
4421         struct list_head        *buffer_list;
4422 };
4423
4424 static int
4425 xfs_btree_block_change_owner(
4426         struct xfs_btree_cur    *cur,
4427         int                     level,
4428         void                    *data)
4429 {
4430         struct xfs_btree_block_change_owner_info        *bbcoi = data;
4431         struct xfs_btree_block  *block;
4432         struct xfs_buf          *bp;
4433
4434         /* modify the owner */
4435         block = xfs_btree_get_block(cur, level, &bp);
4436         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4437                 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4438         else
4439                 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4440
4441         /*
4442          * If the block is a root block hosted in an inode, we might not have a
4443          * buffer pointer here and we shouldn't attempt to log the change as the
4444          * information is already held in the inode and discarded when the root
4445          * block is formatted into the on-disk inode fork. We still change it,
4446          * though, so everything is consistent in memory.
4447          */
4448         if (bp) {
4449                 if (cur->bc_tp) {
4450                         xfs_trans_ordered_buf(cur->bc_tp, bp);
4451                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4452                 } else {
4453                         xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4454                 }
4455         } else {
4456                 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4457                 ASSERT(level == cur->bc_nlevels - 1);
4458         }
4459
4460         return 0;
4461 }
4462
4463 int
4464 xfs_btree_change_owner(
4465         struct xfs_btree_cur    *cur,
4466         __uint64_t              new_owner,
4467         struct list_head        *buffer_list)
4468 {
4469         struct xfs_btree_block_change_owner_info        bbcoi;
4470
4471         bbcoi.new_owner = new_owner;
4472         bbcoi.buffer_list = buffer_list;
4473
4474         return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4475                         &bbcoi);
4476 }
4477
4478 /**
4479  * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4480  *                                    btree block
4481  *
4482  * @bp: buffer containing the btree block
4483  * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4484  * @pag_max_level: pointer to the per-ag max level field
4485  */
4486 bool
4487 xfs_btree_sblock_v5hdr_verify(
4488         struct xfs_buf          *bp)
4489 {
4490         struct xfs_mount        *mp = bp->b_target->bt_mount;
4491         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4492         struct xfs_perag        *pag = bp->b_pag;
4493
4494         if (!xfs_sb_version_hascrc(&mp->m_sb))
4495                 return false;
4496         if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4497                 return false;
4498         if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4499                 return false;
4500         if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4501                 return false;
4502         return true;
4503 }
4504
4505 /**
4506  * xfs_btree_sblock_verify() -- verify a short-format btree block
4507  *
4508  * @bp: buffer containing the btree block
4509  * @max_recs: maximum records allowed in this btree node
4510  */
4511 bool
4512 xfs_btree_sblock_verify(
4513         struct xfs_buf          *bp,
4514         unsigned int            max_recs)
4515 {
4516         struct xfs_mount        *mp = bp->b_target->bt_mount;
4517         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4518
4519         /* numrecs verification */
4520         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4521                 return false;
4522
4523         /* sibling pointer verification */
4524         if (!block->bb_u.s.bb_leftsib ||
4525             (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
4526              block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
4527                 return false;
4528         if (!block->bb_u.s.bb_rightsib ||
4529             (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
4530              block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
4531                 return false;
4532
4533         return true;
4534 }
4535
4536 /*
4537  * Calculate the number of btree levels needed to store a given number of
4538  * records in a short-format btree.
4539  */
4540 uint
4541 xfs_btree_compute_maxlevels(
4542         struct xfs_mount        *mp,
4543         uint                    *limits,
4544         unsigned long           len)
4545 {
4546         uint                    level;
4547         unsigned long           maxblocks;
4548
4549         maxblocks = (len + limits[0] - 1) / limits[0];
4550         for (level = 1; maxblocks > 1; level++)
4551                 maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4552         return level;
4553 }
4554
4555 /*
4556  * Query a regular btree for all records overlapping a given interval.
4557  * Start with a LE lookup of the key of low_rec and return all records
4558  * until we find a record with a key greater than the key of high_rec.
4559  */
4560 STATIC int
4561 xfs_btree_simple_query_range(
4562         struct xfs_btree_cur            *cur,
4563         union xfs_btree_key             *low_key,
4564         union xfs_btree_key             *high_key,
4565         xfs_btree_query_range_fn        fn,
4566         void                            *priv)
4567 {
4568         union xfs_btree_rec             *recp;
4569         union xfs_btree_key             rec_key;
4570         __int64_t                       diff;
4571         int                             stat;
4572         bool                            firstrec = true;
4573         int                             error;
4574
4575         ASSERT(cur->bc_ops->init_high_key_from_rec);
4576         ASSERT(cur->bc_ops->diff_two_keys);
4577
4578         /*
4579          * Find the leftmost record.  The btree cursor must be set
4580          * to the low record used to generate low_key.
4581          */
4582         stat = 0;
4583         error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4584         if (error)
4585                 goto out;
4586
4587         /* Nothing?  See if there's anything to the right. */
4588         if (!stat) {
4589                 error = xfs_btree_increment(cur, 0, &stat);
4590                 if (error)
4591                         goto out;
4592         }
4593
4594         while (stat) {
4595                 /* Find the record. */
4596                 error = xfs_btree_get_rec(cur, &recp, &stat);
4597                 if (error || !stat)
4598                         break;
4599
4600                 /* Skip if high_key(rec) < low_key. */
4601                 if (firstrec) {
4602                         cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4603                         firstrec = false;
4604                         diff = cur->bc_ops->diff_two_keys(cur, low_key,
4605                                         &rec_key);
4606                         if (diff > 0)
4607                                 goto advloop;
4608                 }
4609
4610                 /* Stop if high_key < low_key(rec). */
4611                 cur->bc_ops->init_key_from_rec(&rec_key, recp);
4612                 diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4613                 if (diff > 0)
4614                         break;
4615
4616                 /* Callback */
4617                 error = fn(cur, recp, priv);
4618                 if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
4619                         break;
4620
4621 advloop:
4622                 /* Move on to the next record. */
4623                 error = xfs_btree_increment(cur, 0, &stat);
4624                 if (error)
4625                         break;
4626         }
4627
4628 out:
4629         return error;
4630 }
4631
4632 /*
4633  * Query an overlapped interval btree for all records overlapping a given
4634  * interval.  This function roughly follows the algorithm given in
4635  * "Interval Trees" of _Introduction to Algorithms_, which is section
4636  * 14.3 in the 2nd and 3rd editions.
4637  *
4638  * First, generate keys for the low and high records passed in.
4639  *
4640  * For any leaf node, generate the high and low keys for the record.
4641  * If the record keys overlap with the query low/high keys, pass the
4642  * record to the function iterator.
4643  *
4644  * For any internal node, compare the low and high keys of each
4645  * pointer against the query low/high keys.  If there's an overlap,
4646  * follow the pointer.
4647  *
4648  * As an optimization, we stop scanning a block when we find a low key
4649  * that is greater than the query's high key.
4650  */
4651 STATIC int
4652 xfs_btree_overlapped_query_range(
4653         struct xfs_btree_cur            *cur,
4654         union xfs_btree_key             *low_key,
4655         union xfs_btree_key             *high_key,
4656         xfs_btree_query_range_fn        fn,
4657         void                            *priv)
4658 {
4659         union xfs_btree_ptr             ptr;
4660         union xfs_btree_ptr             *pp;
4661         union xfs_btree_key             rec_key;
4662         union xfs_btree_key             rec_hkey;
4663         union xfs_btree_key             *lkp;
4664         union xfs_btree_key             *hkp;
4665         union xfs_btree_rec             *recp;
4666         struct xfs_btree_block          *block;
4667         __int64_t                       ldiff;
4668         __int64_t                       hdiff;
4669         int                             level;
4670         struct xfs_buf                  *bp;
4671         int                             i;
4672         int                             error;
4673
4674         /* Load the root of the btree. */
4675         level = cur->bc_nlevels - 1;
4676         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4677         error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4678         if (error)
4679                 return error;
4680         xfs_btree_get_block(cur, level, &bp);
4681         trace_xfs_btree_overlapped_query_range(cur, level, bp);
4682 #ifdef DEBUG
4683         error = xfs_btree_check_block(cur, block, level, bp);
4684         if (error)
4685                 goto out;
4686 #endif
4687         cur->bc_ptrs[level] = 1;
4688
4689         while (level < cur->bc_nlevels) {
4690                 block = xfs_btree_get_block(cur, level, &bp);
4691
4692                 /* End of node, pop back towards the root. */
4693                 if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4694 pop_up:
4695                         if (level < cur->bc_nlevels - 1)
4696                                 cur->bc_ptrs[level + 1]++;
4697                         level++;
4698                         continue;
4699                 }
4700
4701                 if (level == 0) {
4702                         /* Handle a leaf node. */
4703                         recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4704
4705                         cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4706                         ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4707                                         low_key);
4708
4709                         cur->bc_ops->init_key_from_rec(&rec_key, recp);
4710                         hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4711                                         &rec_key);
4712
4713                         /*
4714                          * If (record's high key >= query's low key) and
4715                          *    (query's high key >= record's low key), then
4716                          * this record overlaps the query range; callback.
4717                          */
4718                         if (ldiff >= 0 && hdiff >= 0) {
4719                                 error = fn(cur, recp, priv);
4720                                 if (error < 0 ||
4721                                     error == XFS_BTREE_QUERY_RANGE_ABORT)
4722                                         break;
4723                         } else if (hdiff < 0) {
4724                                 /* Record is larger than high key; pop. */
4725                                 goto pop_up;
4726                         }
4727                         cur->bc_ptrs[level]++;
4728                         continue;
4729                 }
4730
4731                 /* Handle an internal node. */
4732                 lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4733                 hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4734                 pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4735
4736                 ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4737                 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4738
4739                 /*
4740                  * If (pointer's high key >= query's low key) and
4741                  *    (query's high key >= pointer's low key), then
4742                  * this record overlaps the query range; follow pointer.
4743                  */
4744                 if (ldiff >= 0 && hdiff >= 0) {
4745                         level--;
4746                         error = xfs_btree_lookup_get_block(cur, level, pp,
4747                                         &block);
4748                         if (error)
4749                                 goto out;
4750                         xfs_btree_get_block(cur, level, &bp);
4751                         trace_xfs_btree_overlapped_query_range(cur, level, bp);
4752 #ifdef DEBUG
4753                         error = xfs_btree_check_block(cur, block, level, bp);
4754                         if (error)
4755                                 goto out;
4756 #endif
4757                         cur->bc_ptrs[level] = 1;
4758                         continue;
4759                 } else if (hdiff < 0) {
4760                         /* The low key is larger than the upper range; pop. */
4761                         goto pop_up;
4762                 }
4763                 cur->bc_ptrs[level]++;
4764         }
4765
4766 out:
4767         /*
4768          * If we don't end this function with the cursor pointing at a record
4769          * block, a subsequent non-error cursor deletion will not release
4770          * node-level buffers, causing a buffer leak.  This is quite possible
4771          * with a zero-results range query, so release the buffers if we
4772          * failed to return any results.
4773          */
4774         if (cur->bc_bufs[0] == NULL) {
4775                 for (i = 0; i < cur->bc_nlevels; i++) {
4776                         if (cur->bc_bufs[i]) {
4777                                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4778                                 cur->bc_bufs[i] = NULL;
4779                                 cur->bc_ptrs[i] = 0;
4780                                 cur->bc_ra[i] = 0;
4781                         }
4782                 }
4783         }
4784
4785         return error;
4786 }
4787
4788 /*
4789  * Query a btree for all records overlapping a given interval of keys.  The
4790  * supplied function will be called with each record found; return one of the
4791  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4792  * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
4793  * negative error code.
4794  */
4795 int
4796 xfs_btree_query_range(
4797         struct xfs_btree_cur            *cur,
4798         union xfs_btree_irec            *low_rec,
4799         union xfs_btree_irec            *high_rec,
4800         xfs_btree_query_range_fn        fn,
4801         void                            *priv)
4802 {
4803         union xfs_btree_rec             rec;
4804         union xfs_btree_key             low_key;
4805         union xfs_btree_key             high_key;
4806
4807         /* Find the keys of both ends of the interval. */
4808         cur->bc_rec = *high_rec;
4809         cur->bc_ops->init_rec_from_cur(cur, &rec);
4810         cur->bc_ops->init_key_from_rec(&high_key, &rec);
4811
4812         cur->bc_rec = *low_rec;
4813         cur->bc_ops->init_rec_from_cur(cur, &rec);
4814         cur->bc_ops->init_key_from_rec(&low_key, &rec);
4815
4816         /* Enforce low key < high key. */
4817         if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4818                 return -EINVAL;
4819
4820         if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4821                 return xfs_btree_simple_query_range(cur, &low_key,
4822                                 &high_key, fn, priv);
4823         return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4824                         fn, priv);
4825 }
4826
4827 /*
4828  * Calculate the number of blocks needed to store a given number of records
4829  * in a short-format (per-AG metadata) btree.
4830  */
4831 xfs_extlen_t
4832 xfs_btree_calc_size(
4833         struct xfs_mount        *mp,
4834         uint                    *limits,
4835         unsigned long long      len)
4836 {
4837         int                     level;
4838         int                     maxrecs;
4839         xfs_extlen_t            rval;
4840
4841         maxrecs = limits[0];
4842         for (level = 0, rval = 0; len > 1; level++) {
4843                 len += maxrecs - 1;
4844                 do_div(len, maxrecs);
4845                 maxrecs = limits[1];
4846                 rval += len;
4847         }
4848         return rval;
4849 }
4850
4851 static int
4852 xfs_btree_count_blocks_helper(
4853         struct xfs_btree_cur    *cur,
4854         int                     level,
4855         void                    *data)
4856 {
4857         xfs_extlen_t            *blocks = data;
4858         (*blocks)++;
4859
4860         return 0;
4861 }
4862
4863 /* Count the blocks in a btree and return the result in *blocks. */
4864 int
4865 xfs_btree_count_blocks(
4866         struct xfs_btree_cur    *cur,
4867         xfs_extlen_t            *blocks)
4868 {
4869         *blocks = 0;
4870         return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4871                         blocks);
4872 }