]> git.karo-electronics.de Git - karo-tx-linux.git/blob - fs/xfs/xfs_da_btree.c
3770111959756b8b1932892fa179f2cb5265d38d
[karo-tx-linux.git] / fs / xfs / xfs_da_btree.c
1 /*
2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3  * Copyright (c) 2013 Red Hat, Inc.
4  * All Rights Reserved.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it would be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write the Free Software Foundation,
17  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18  */
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_shared.h"
22 #include "xfs_format.h"
23 #include "xfs_log_format.h"
24 #include "xfs_trans_resv.h"
25 #include "xfs_bit.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_mount.h"
29 #include "xfs_da_format.h"
30 #include "xfs_da_btree.h"
31 #include "xfs_bmap_btree.h"
32 #include "xfs_dir2.h"
33 #include "xfs_dir2_priv.h"
34 #include "xfs_dinode.h"
35 #include "xfs_inode.h"
36 #include "xfs_trans.h"
37 #include "xfs_inode_item.h"
38 #include "xfs_alloc.h"
39 #include "xfs_bmap.h"
40 #include "xfs_attr.h"
41 #include "xfs_attr_leaf.h"
42 #include "xfs_error.h"
43 #include "xfs_trace.h"
44 #include "xfs_cksum.h"
45 #include "xfs_buf_item.h"
46
47 /*
48  * xfs_da_btree.c
49  *
50  * Routines to implement directories as Btrees of hashed names.
51  */
52
53 /*========================================================================
54  * Function prototypes for the kernel.
55  *========================================================================*/
56
57 /*
58  * Routines used for growing the Btree.
59  */
60 STATIC int xfs_da3_root_split(xfs_da_state_t *state,
61                                             xfs_da_state_blk_t *existing_root,
62                                             xfs_da_state_blk_t *new_child);
63 STATIC int xfs_da3_node_split(xfs_da_state_t *state,
64                                             xfs_da_state_blk_t *existing_blk,
65                                             xfs_da_state_blk_t *split_blk,
66                                             xfs_da_state_blk_t *blk_to_add,
67                                             int treelevel,
68                                             int *result);
69 STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
70                                          xfs_da_state_blk_t *node_blk_1,
71                                          xfs_da_state_blk_t *node_blk_2);
72 STATIC void xfs_da3_node_add(xfs_da_state_t *state,
73                                    xfs_da_state_blk_t *old_node_blk,
74                                    xfs_da_state_blk_t *new_node_blk);
75
76 /*
77  * Routines used for shrinking the Btree.
78  */
79 STATIC int xfs_da3_root_join(xfs_da_state_t *state,
80                                            xfs_da_state_blk_t *root_blk);
81 STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
82 STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
83                                               xfs_da_state_blk_t *drop_blk);
84 STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
85                                          xfs_da_state_blk_t *src_node_blk,
86                                          xfs_da_state_blk_t *dst_node_blk);
87
88 /*
89  * Utility routines.
90  */
91 STATIC int      xfs_da3_blk_unlink(xfs_da_state_t *state,
92                                   xfs_da_state_blk_t *drop_blk,
93                                   xfs_da_state_blk_t *save_blk);
94
95
96 kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
97
/*
 * Allocate a dir-state structure.
 * We don't put them on the stack since they're large.
 */
xfs_da_state_t *
xfs_da_state_alloc(void)
{
	/* KM_NOFS: this can run in transaction context, so no fs reclaim */
	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
}
107
108 /*
109  * Kill the altpath contents of a da-state structure.
110  */
111 STATIC void
112 xfs_da_state_kill_altpath(xfs_da_state_t *state)
113 {
114         int     i;
115
116         for (i = 0; i < state->altpath.active; i++)
117                 state->altpath.blk[i].bp = NULL;
118         state->altpath.active = 0;
119 }
120
/*
 * Free a da-state structure.
 */
void
xfs_da_state_free(xfs_da_state_t *state)
{
	/* drop altpath buffer references before the memory goes away */
	xfs_da_state_kill_altpath(state);
#ifdef DEBUG
	/* poison the structure so stale use is caught quickly */
	memset((char *)state, 0, sizeof(*state));
#endif /* DEBUG */
	kmem_zone_free(xfs_da_state_zone, state);
}
133
/*
 * Convert an on-disk da btree node header (v2 or v3 format, selected by
 * the magic number) into the common in-core xfs_da3_icnode_hdr form.
 */
void
xfs_da3_node_hdr_from_disk(
	struct xfs_da3_icnode_hdr	*to,
	struct xfs_da_intnode		*from)
{
	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
	       from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));

	if (from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
		/* v3 (CRC-enabled) header has a larger blkinfo layout */
		struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;

		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
		to->back = be32_to_cpu(hdr3->info.hdr.back);
		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
		to->count = be16_to_cpu(hdr3->__count);
		to->level = be16_to_cpu(hdr3->__level);
		return;
	}
	/* v2 header */
	to->forw = be32_to_cpu(from->hdr.info.forw);
	to->back = be32_to_cpu(from->hdr.info.back);
	to->magic = be16_to_cpu(from->hdr.info.magic);
	to->count = be16_to_cpu(from->hdr.__count);
	to->level = be16_to_cpu(from->hdr.__level);
}
158
/*
 * Write an in-core da btree node header back out in the appropriate
 * on-disk format (v2 or v3, selected by the in-core magic number).
 */
void
xfs_da3_node_hdr_to_disk(
	struct xfs_da_intnode		*to,
	struct xfs_da3_icnode_hdr	*from)
{
	ASSERT(from->magic == XFS_DA_NODE_MAGIC ||
	       from->magic == XFS_DA3_NODE_MAGIC);

	if (from->magic == XFS_DA3_NODE_MAGIC) {
		/* v3 (CRC-enabled) header has a larger blkinfo layout */
		struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;

		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
		hdr3->info.hdr.back = cpu_to_be32(from->back);
		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
		hdr3->__count = cpu_to_be16(from->count);
		hdr3->__level = cpu_to_be16(from->level);
		return;
	}
	/* v2 header */
	to->hdr.info.forw = cpu_to_be32(from->forw);
	to->hdr.info.back = cpu_to_be32(from->back);
	to->hdr.info.magic = cpu_to_be16(from->magic);
	to->hdr.__count = cpu_to_be16(from->count);
	to->hdr.__level = cpu_to_be16(from->level);
}
183
/*
 * Structural verification of a da btree node block.  Returns false if the
 * block fails any sanity check: wrong magic for the filesystem's header
 * version, mismatched uuid/blkno (CRC filesystems only), bad level, or an
 * implausible entry count.  CRC validation itself is done by the callers.
 */
static bool
xfs_da3_node_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_da_intnode	*hdr = bp->b_addr;
	struct xfs_da3_icnode_hdr ichdr;

	xfs_da3_node_hdr_from_disk(&ichdr, hdr);

	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;

		/* CRC filesystems must carry the v3 magic and self-describing
		 * metadata (uuid + block number) that match this location. */
		if (ichdr.magic != XFS_DA3_NODE_MAGIC)
			return false;

		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
			return false;
		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
			return false;
	} else {
		if (ichdr.magic != XFS_DA_NODE_MAGIC)
			return false;
	}
	/* interior nodes are never at level 0 and never above max depth */
	if (ichdr.level == 0)
		return false;
	if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
		return false;
	if (ichdr.count == 0)
		return false;

	/*
	 * we don't know if the node is for an attribute or directory tree,
	 * so only fail if the count is outside both bounds
	 */
	if (ichdr.count > mp->m_dir_node_ents &&
	    ichdr.count > mp->m_attr_node_ents)
		return false;

	/* XXX: hash order check? */

	return true;
}
227
/*
 * Write verifier for da btree node blocks: run the structural checks,
 * then (on CRC filesystems) stamp the last-modification LSN and
 * recalculate the block CRC just before the buffer goes to disk.
 */
static void
xfs_da3_node_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_buf_log_item	*bip = bp->b_fspriv;
	struct xfs_da3_node_hdr	*hdr3 = bp->b_addr;

	if (!xfs_da3_node_verify(bp)) {
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
		return;
	}

	/* non-CRC filesystems have no lsn or checksum field to update */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	/* record the LSN of the last modification before checksumming,
	 * so the lsn is covered by the CRC */
	if (bip)
		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF);
}
250
251 /*
252  * leaf/node format detection on trees is sketchy, so a node read can be done on
253  * leaf level blocks when detection identifies the tree as a node format tree
254  * incorrectly. In this case, we need to swap the verifier to match the correct
255  * format of the block being read.
256  */
257 static void
258 xfs_da3_node_read_verify(
259         struct xfs_buf          *bp)
260 {
261         struct xfs_mount        *mp = bp->b_target->bt_mount;
262         struct xfs_da_blkinfo   *info = bp->b_addr;
263
264         switch (be16_to_cpu(info->magic)) {
265                 case XFS_DA3_NODE_MAGIC:
266                         if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
267                                               XFS_DA3_NODE_CRC_OFF))
268                                 break;
269                         /* fall through */
270                 case XFS_DA_NODE_MAGIC:
271                         if (!xfs_da3_node_verify(bp))
272                                 break;
273                         return;
274                 case XFS_ATTR_LEAF_MAGIC:
275                 case XFS_ATTR3_LEAF_MAGIC:
276                         bp->b_ops = &xfs_attr3_leaf_buf_ops;
277                         bp->b_ops->verify_read(bp);
278                         return;
279                 case XFS_DIR2_LEAFN_MAGIC:
280                 case XFS_DIR3_LEAFN_MAGIC:
281                         bp->b_ops = &xfs_dir3_leafn_buf_ops;
282                         bp->b_ops->verify_read(bp);
283                         return;
284                 default:
285                         break;
286         }
287
288         /* corrupt block */
289         XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
290         xfs_buf_ioerror(bp, EFSCORRUPTED);
291 }
292
/* Buffer ops for da btree node blocks; the read verifier also handles
 * misdetected leaf blocks by switching to the matching leaf ops. */
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
	.verify_read = xfs_da3_node_read_verify,
	.verify_write = xfs_da3_node_write_verify,
};
297
/*
 * Read a da btree block and, when running in a transaction, tag the buffer
 * with the correct log item type based on the magic number actually found
 * (the block may turn out to be a node or either flavour of leaf).
 *
 * Returns 0 on success with *bpp set, or an error from xfs_da_read_buf.
 */
int
xfs_da3_node_read(
	struct xfs_trans	*tp,
	struct xfs_inode	*dp,
	xfs_dablk_t		bno,
	xfs_daddr_t		mappedbno,
	struct xfs_buf		**bpp,
	int			which_fork)
{
	int			err;

	err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
					which_fork, &xfs_da3_node_buf_ops);
	if (!err && tp) {
		struct xfs_da_blkinfo	*info = (*bpp)->b_addr;
		int			type;

		switch (be16_to_cpu(info->magic)) {
		case XFS_DA_NODE_MAGIC:
		case XFS_DA3_NODE_MAGIC:
			type = XFS_BLFT_DA_NODE_BUF;
			break;
		case XFS_ATTR_LEAF_MAGIC:
		case XFS_ATTR3_LEAF_MAGIC:
			type = XFS_BLFT_ATTR_LEAF_BUF;
			break;
		case XFS_DIR2_LEAFN_MAGIC:
		case XFS_DIR3_LEAFN_MAGIC:
			type = XFS_BLFT_DIR_LEAFN_BUF;
			break;
		default:
			/* verifier should have rejected anything else */
			type = 0;
			ASSERT(0);
			break;
		}
		xfs_trans_buf_set_type(tp, *bpp, type);
	}
	return err;
}
337
338 /*========================================================================
339  * Routines used for growing the Btree.
340  *========================================================================*/
341
342 /*
343  * Create the initial contents of an intermediate node.
344  */
345 int
346 xfs_da3_node_create(
347         struct xfs_da_args      *args,
348         xfs_dablk_t             blkno,
349         int                     level,
350         struct xfs_buf          **bpp,
351         int                     whichfork)
352 {
353         struct xfs_da_intnode   *node;
354         struct xfs_trans        *tp = args->trans;
355         struct xfs_mount        *mp = tp->t_mountp;
356         struct xfs_da3_icnode_hdr ichdr = {0};
357         struct xfs_buf          *bp;
358         int                     error;
359
360         trace_xfs_da_node_create(args);
361         ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
362
363         error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
364         if (error)
365                 return(error);
366         bp->b_ops = &xfs_da3_node_buf_ops;
367         xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
368         node = bp->b_addr;
369
370         if (xfs_sb_version_hascrc(&mp->m_sb)) {
371                 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
372
373                 ichdr.magic = XFS_DA3_NODE_MAGIC;
374                 hdr3->info.blkno = cpu_to_be64(bp->b_bn);
375                 hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
376                 uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
377         } else {
378                 ichdr.magic = XFS_DA_NODE_MAGIC;
379         }
380         ichdr.level = level;
381
382         xfs_da3_node_hdr_to_disk(node, &ichdr);
383         xfs_trans_log_buf(tp, bp,
384                 XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
385
386         *bpp = bp;
387         return(0);
388 }
389
/*
 * Split a leaf node, rebalance, then possibly split
 * intermediate nodes, rebalance, etc.
 *
 * On entry, state->path holds the blocks visited by the failed insert.
 * We walk it bottom-up, splitting at each level until an insert fits,
 * and finally split the root if a new child block propagated all the
 * way to the top.  Returns 0 or an error code.
 */
int							/* error */
xfs_da3_split(
	struct xfs_da_state	*state)
{
	struct xfs_da_state_blk	*oldblk;
	struct xfs_da_state_blk	*newblk;
	struct xfs_da_state_blk	*addblk;
	struct xfs_da_intnode	*node;
	struct xfs_buf		*bp;
	int			max;
	int			action = 0;
	int			error;
	int			i;

	trace_xfs_da_split(state->args);

	/*
	 * Walk back up the tree splitting/inserting/adjusting as necessary.
	 * If we need to insert and there isn't room, split the node, then
	 * decide which fragment to insert the new block from below into.
	 * Note that we may split the root this way, but we need more fixup.
	 */
	max = state->path.active - 1;
	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
	       state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);

	addblk = &state->path.blk[max];		/* initial dummy value */
	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
		oldblk = &state->path.blk[i];
		newblk = &state->altpath.blk[i];

		/*
		 * If a leaf node then
		 *     Allocate a new leaf node, then rebalance across them.
		 * else if an intermediate node then
		 *     We split on the last layer, must we split the node?
		 */
		switch (oldblk->magic) {
		case XFS_ATTR_LEAF_MAGIC:
			error = xfs_attr3_leaf_split(state, oldblk, newblk);
			if ((error != 0) && (error != ENOSPC)) {
				return(error);	/* GROT: attr is inconsistent */
			}
			if (!error) {
				addblk = newblk;
				break;
			}
			/*
			 * Entry wouldn't fit, split the leaf again. The new
			 * extra block goes in front of or behind newblk
			 * depending on where the original insert landed.
			 */
			state->extravalid = 1;
			if (state->inleaf) {
				state->extraafter = 0;	/* before newblk */
				trace_xfs_attr_leaf_split_before(state->args);
				error = xfs_attr3_leaf_split(state, oldblk,
							    &state->extrablk);
			} else {
				state->extraafter = 1;	/* after newblk */
				trace_xfs_attr_leaf_split_after(state->args);
				error = xfs_attr3_leaf_split(state, newblk,
							    &state->extrablk);
			}
			if (error)
				return(error);	/* GROT: attr inconsistent */
			addblk = newblk;
			break;
		case XFS_DIR2_LEAFN_MAGIC:
			error = xfs_dir2_leafn_split(state, oldblk, newblk);
			if (error)
				return error;
			addblk = newblk;
			break;
		case XFS_DA_NODE_MAGIC:
			error = xfs_da3_node_split(state, oldblk, newblk, addblk,
							 max - i, &action);
			addblk->bp = NULL;
			if (error)
				return(error);	/* GROT: dir is inconsistent */
			/*
			 * Record the newly split block for the next time thru?
			 */
			if (action)
				addblk = newblk;
			else
				addblk = NULL;
			break;
		}

		/*
		 * Update the btree to show the new hashval for this child.
		 */
		xfs_da3_fixhashpath(state, &state->path);
	}
	/* no block propagated to the top: no root split needed */
	if (!addblk)
		return(0);

	/*
	 * Split the root node.
	 */
	ASSERT(state->path.active == 0);
	oldblk = &state->path.blk[0];
	error = xfs_da3_root_split(state, oldblk, addblk);
	if (error) {
		addblk->bp = NULL;
		return(error);	/* GROT: dir is inconsistent */
	}

	/*
	 * Update pointers to the node which used to be block 0 and
	 * just got bumped because of the addition of a new root node.
	 * There might be three blocks involved if a double split occurred,
	 * and the original block 0 could be at any position in the list.
	 *
	 * Note: the magic numbers and sibling pointers are in the same
	 * physical place for both v2 and v3 headers (by design). Hence it
	 * doesn't matter which version of the xfs_da_intnode structure we use
	 * here as the result will be the same using either structure.
	 */
	node = oldblk->bp->b_addr;
	if (node->hdr.info.forw) {
		if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
			bp = addblk->bp;
		} else {
			/* the forward sibling must be the double-split block */
			ASSERT(state->extravalid);
			bp = state->extrablk.bp;
		}
		node = bp->b_addr;
		node->hdr.info.back = cpu_to_be32(oldblk->blkno);
		xfs_trans_log_buf(state->args->trans, bp,
		    XFS_DA_LOGRANGE(node, &node->hdr.info,
		    sizeof(node->hdr.info)));
	}
	node = oldblk->bp->b_addr;
	if (node->hdr.info.back) {
		if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
			bp = addblk->bp;
		} else {
			/* the back sibling must be the double-split block */
			ASSERT(state->extravalid);
			bp = state->extrablk.bp;
		}
		node = bp->b_addr;
		node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
		xfs_trans_log_buf(state->args->trans, bp,
		    XFS_DA_LOGRANGE(node, &node->hdr.info,
		    sizeof(node->hdr.info)));
	}
	addblk->bp = NULL;
	return(0);
}
544
545 /*
546  * Split the root.  We have to create a new root and point to the two
547  * parts (the split old root) that we just created.  Copy block zero to
548  * the EOF, extending the inode in process.
549  */
550 STATIC int                                              /* error */
551 xfs_da3_root_split(
552         struct xfs_da_state     *state,
553         struct xfs_da_state_blk *blk1,
554         struct xfs_da_state_blk *blk2)
555 {
556         struct xfs_da_intnode   *node;
557         struct xfs_da_intnode   *oldroot;
558         struct xfs_da_node_entry *btree;
559         struct xfs_da3_icnode_hdr nodehdr;
560         struct xfs_da_args      *args;
561         struct xfs_buf          *bp;
562         struct xfs_inode        *dp;
563         struct xfs_trans        *tp;
564         struct xfs_mount        *mp;
565         struct xfs_dir2_leaf    *leaf;
566         xfs_dablk_t             blkno;
567         int                     level;
568         int                     error;
569         int                     size;
570
571         trace_xfs_da_root_split(state->args);
572
573         /*
574          * Copy the existing (incorrect) block from the root node position
575          * to a free space somewhere.
576          */
577         args = state->args;
578         error = xfs_da_grow_inode(args, &blkno);
579         if (error)
580                 return error;
581
582         dp = args->dp;
583         tp = args->trans;
584         mp = state->mp;
585         error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
586         if (error)
587                 return error;
588         node = bp->b_addr;
589         oldroot = blk1->bp->b_addr;
590         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
591             oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
592                 struct xfs_da3_icnode_hdr nodehdr;
593
594                 xfs_da3_node_hdr_from_disk(&nodehdr, oldroot);
595                 btree = xfs_da3_node_tree_p(oldroot);
596                 size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
597                 level = nodehdr.level;
598
599                 /*
600                  * we are about to copy oldroot to bp, so set up the type
601                  * of bp while we know exactly what it will be.
602                  */
603                 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
604         } else {
605                 struct xfs_dir3_icleaf_hdr leafhdr;
606                 struct xfs_dir2_leaf_entry *ents;
607
608                 leaf = (xfs_dir2_leaf_t *)oldroot;
609                 xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
610                 ents = xfs_dir3_leaf_ents_p(leaf);
611
612                 ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
613                        leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
614                 size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
615                 level = 0;
616
617                 /*
618                  * we are about to copy oldroot to bp, so set up the type
619                  * of bp while we know exactly what it will be.
620                  */
621                 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
622         }
623
624         /*
625          * we can copy most of the information in the node from one block to
626          * another, but for CRC enabled headers we have to make sure that the
627          * block specific identifiers are kept intact. We update the buffer
628          * directly for this.
629          */
630         memcpy(node, oldroot, size);
631         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
632             oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
633                 struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
634
635                 node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
636         }
637         xfs_trans_log_buf(tp, bp, 0, size - 1);
638
639         bp->b_ops = blk1->bp->b_ops;
640         xfs_trans_buf_copy_type(bp, blk1->bp);
641         blk1->bp = bp;
642         blk1->blkno = blkno;
643
644         /*
645          * Set up the new root node.
646          */
647         error = xfs_da3_node_create(args,
648                 (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
649                 level + 1, &bp, args->whichfork);
650         if (error)
651                 return error;
652
653         node = bp->b_addr;
654         xfs_da3_node_hdr_from_disk(&nodehdr, node);
655         btree = xfs_da3_node_tree_p(node);
656         btree[0].hashval = cpu_to_be32(blk1->hashval);
657         btree[0].before = cpu_to_be32(blk1->blkno);
658         btree[1].hashval = cpu_to_be32(blk2->hashval);
659         btree[1].before = cpu_to_be32(blk2->blkno);
660         nodehdr.count = 2;
661         xfs_da3_node_hdr_to_disk(node, &nodehdr);
662
663 #ifdef DEBUG
664         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
665             oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
666                 ASSERT(blk1->blkno >= mp->m_dirleafblk &&
667                        blk1->blkno < mp->m_dirfreeblk);
668                 ASSERT(blk2->blkno >= mp->m_dirleafblk &&
669                        blk2->blkno < mp->m_dirfreeblk);
670         }
671 #endif
672
673         /* Header is already logged by xfs_da_node_create */
674         xfs_trans_log_buf(tp, bp,
675                 XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
676
677         return 0;
678 }
679
/*
 * Split the node, rebalance, then add the new entry.
 *
 * If @oldblk is full, a new node is allocated into @newblk, entries are
 * rebalanced between the two, and *result is set to 1 so the caller knows
 * a new block must be propagated upwards; otherwise *result is 0.  The
 * pending child @addblk (and, for attr forks, the optional extra block)
 * is then inserted into whichever node the insertion index now lands in.
 */
STATIC int						/* error */
xfs_da3_node_split(
	struct xfs_da_state	*state,
	struct xfs_da_state_blk	*oldblk,
	struct xfs_da_state_blk	*newblk,
	struct xfs_da_state_blk	*addblk,
	int			treelevel,
	int			*result)
{
	struct xfs_da_intnode	*node;
	struct xfs_da3_icnode_hdr nodehdr;
	xfs_dablk_t		blkno;
	int			newcount;
	int			error;
	int			useextra;

	trace_xfs_da_node_split(state->args);

	node = oldblk->bp->b_addr;
	xfs_da3_node_hdr_from_disk(&nodehdr, node);

	/*
	 * With V2 dirs the extra block is data or freespace.
	 */
	useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
	newcount = 1 + useextra;
	/*
	 * Do we have to split the node?
	 */
	if (nodehdr.count + newcount > state->node_ents) {
		/*
		 * Allocate a new node, add to the doubly linked chain of
		 * nodes, then move some of our excess entries into it.
		 */
		error = xfs_da_grow_inode(state->args, &blkno);
		if (error)
			return(error);	/* GROT: dir is inconsistent */

		error = xfs_da3_node_create(state->args, blkno, treelevel,
					   &newblk->bp, state->args->whichfork);
		if (error)
			return(error);	/* GROT: dir is inconsistent */
		newblk->blkno = blkno;
		newblk->magic = XFS_DA_NODE_MAGIC;
		xfs_da3_node_rebalance(state, oldblk, newblk);
		error = xfs_da3_blk_link(state, oldblk, newblk);
		if (error)
			return(error);
		*result = 1;
	} else {
		*result = 0;
	}

	/*
	 * Insert the new entry(s) into the correct block
	 * (updating last hashval in the process).
	 *
	 * xfs_da3_node_add() inserts BEFORE the given index,
	 * and as a result of using node_lookup_int() we always
	 * point to a valid entry (not after one), but a split
	 * operation always results in a new block whose hashvals
	 * FOLLOW the current block.
	 *
	 * If we had double-split op below us, then add the extra block too.
	 */
	/* re-read the header: rebalance above may have changed the count */
	node = oldblk->bp->b_addr;
	xfs_da3_node_hdr_from_disk(&nodehdr, node);
	if (oldblk->index <= nodehdr.count) {
		oldblk->index++;
		xfs_da3_node_add(state, oldblk, addblk);
		if (useextra) {
			if (state->extraafter)
				oldblk->index++;
			xfs_da3_node_add(state, oldblk, &state->extrablk);
			state->extravalid = 0;
		}
	} else {
		/* insertion point moved into the new block during rebalance */
		newblk->index++;
		xfs_da3_node_add(state, newblk, addblk);
		if (useextra) {
			if (state->extraafter)
				newblk->index++;
			xfs_da3_node_add(state, newblk, &state->extrablk);
			state->extravalid = 0;
		}
	}

	return(0);
}
772
773 /*
774  * Balance the btree elements between two intermediate nodes,
775  * usually one full and one empty.
776  *
777  * NOTE: if blk2 is empty, then it will get the upper half of blk1.
778  */
STATIC void
xfs_da3_node_rebalance(
	struct xfs_da_state	*state,
	struct xfs_da_state_blk	*blk1,
	struct xfs_da_state_blk	*blk2)
{
	struct xfs_da_intnode	*node1;
	struct xfs_da_intnode	*node2;
	struct xfs_da_intnode	*tmpnode;
	struct xfs_da_node_entry *btree1;
	struct xfs_da_node_entry *btree2;
	struct xfs_da_node_entry *btree_s;	/* source of entry move */
	struct xfs_da_node_entry *btree_d;	/* destination of entry move */
	struct xfs_da3_icnode_hdr nodehdr1;
	struct xfs_da3_icnode_hdr nodehdr2;
	struct xfs_trans	*tp;
	int			count;
	int			tmp;
	int			swap = 0;

	trace_xfs_da_node_rebalance(state->args);

	node1 = blk1->bp->b_addr;
	node2 = blk2->bp->b_addr;
	xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
	xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
	btree1 = xfs_da3_node_tree_p(node1);
	btree2 = xfs_da3_node_tree_p(node2);

	/*
	 * Figure out how many entries need to move, and in which direction.
	 * Swap the nodes around if that makes it simpler, so that below
	 * node1 always holds the lower hashvals.  Only the local node/btree
	 * pointers are swapped; blk1/blk2 keep their identities.
	 */
	if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
	     (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
			be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
		tmpnode = node1;
		node1 = node2;
		node2 = tmpnode;
		xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
		xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
		btree1 = xfs_da3_node_tree_p(node1);
		btree2 = xfs_da3_node_tree_p(node2);
		swap = 1;
	}

	/*
	 * count > 0 means node1 has the surplus and entries move 1 -> 2;
	 * count < 0 means node2 has the surplus and entries move 2 -> 1;
	 * count == 0 means the nodes are already balanced.
	 */
	count = (nodehdr1.count - nodehdr2.count) / 2;
	if (count == 0)
		return;
	tp = state->args->trans;
	/*
	 * Two cases: high-to-low and low-to-high.
	 */
	if (count > 0) {
		/*
		 * Move elements in node2 up to make a hole.
		 */
		tmp = nodehdr2.count;
		if (tmp > 0) {
			tmp *= (uint)sizeof(xfs_da_node_entry_t);
			btree_s = &btree2[0];
			btree_d = &btree2[count];
			memmove(btree_d, btree_s, tmp);
		}

		/*
		 * Move the req'd B-tree elements from high in node1 to
		 * low in node2.
		 */
		nodehdr2.count += count;
		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
		btree_s = &btree1[nodehdr1.count - count];
		btree_d = &btree2[0];
		memcpy(btree_d, btree_s, tmp);
		nodehdr1.count -= count;
	} else {
		/*
		 * Move the req'd B-tree elements from low in node2 to
		 * high in node1.
		 */
		count = -count;
		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
		btree_s = &btree2[0];
		btree_d = &btree1[nodehdr1.count];
		memcpy(btree_d, btree_s, tmp);
		nodehdr1.count += count;

		/* Log only the region of node1 that gained entries. */
		xfs_trans_log_buf(tp, blk1->bp,
			XFS_DA_LOGRANGE(node1, btree_d, tmp));

		/*
		 * Move elements in node2 down to fill the hole.
		 */
		tmp  = nodehdr2.count - count;
		tmp *= (uint)sizeof(xfs_da_node_entry_t);
		btree_s = &btree2[count];
		btree_d = &btree2[0];
		memmove(btree_d, btree_s, tmp);
		nodehdr2.count -= count;
	}

	/*
	 * Log header of node 1 and all current bits of node 2.
	 * The in-core headers were modified above, so write them back
	 * to the disk buffers before logging.
	 */
	xfs_da3_node_hdr_to_disk(node1, &nodehdr1);
	xfs_trans_log_buf(tp, blk1->bp,
		XFS_DA_LOGRANGE(node1, &node1->hdr,
				xfs_da3_node_hdr_size(node1)));

	xfs_da3_node_hdr_to_disk(node2, &nodehdr2);
	xfs_trans_log_buf(tp, blk2->bp,
		XFS_DA_LOGRANGE(node2, &node2->hdr,
				xfs_da3_node_hdr_size(node2) +
				(sizeof(btree2[0]) * nodehdr2.count)));

	/*
	 * Record the last hashval from each block for upward propagation.
	 * (note: don't use the swapped node pointers - reload from the
	 * buffers so node1/btree1 really belong to blk1 again)
	 */
	if (swap) {
		node1 = blk1->bp->b_addr;
		node2 = blk2->bp->b_addr;
		xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
		xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
		btree1 = xfs_da3_node_tree_p(node1);
		btree2 = xfs_da3_node_tree_p(node2);
	}
	blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
	blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);

	/*
	 * Adjust the expected index for insertion.  If the insertion point
	 * moved into blk2, redirect it there and invalidate blk1's index.
	 */
	if (blk1->index >= nodehdr1.count) {
		blk2->index = blk1->index - nodehdr1.count;
		blk1->index = nodehdr1.count + 1;	/* make it invalid */
	}
}
918
919 /*
920  * Add a new entry to an intermediate node.
921  */
922 STATIC void
923 xfs_da3_node_add(
924         struct xfs_da_state     *state,
925         struct xfs_da_state_blk *oldblk,
926         struct xfs_da_state_blk *newblk)
927 {
928         struct xfs_da_intnode   *node;
929         struct xfs_da3_icnode_hdr nodehdr;
930         struct xfs_da_node_entry *btree;
931         int                     tmp;
932
933         trace_xfs_da_node_add(state->args);
934
935         node = oldblk->bp->b_addr;
936         xfs_da3_node_hdr_from_disk(&nodehdr, node);
937         btree = xfs_da3_node_tree_p(node);
938
939         ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
940         ASSERT(newblk->blkno != 0);
941         if (state->args->whichfork == XFS_DATA_FORK)
942                 ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
943                        newblk->blkno < state->mp->m_dirfreeblk);
944
945         /*
946          * We may need to make some room before we insert the new node.
947          */
948         tmp = 0;
949         if (oldblk->index < nodehdr.count) {
950                 tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
951                 memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
952         }
953         btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
954         btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
955         xfs_trans_log_buf(state->args->trans, oldblk->bp,
956                 XFS_DA_LOGRANGE(node, &btree[oldblk->index],
957                                 tmp + sizeof(*btree)));
958
959         nodehdr.count += 1;
960         xfs_da3_node_hdr_to_disk(node, &nodehdr);
961         xfs_trans_log_buf(state->args->trans, oldblk->bp,
962                 XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
963
964         /*
965          * Copy the last hash value from the oldblk to propagate upwards.
966          */
967         oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
968 }
969
970 /*========================================================================
971  * Routines used for shrinking the Btree.
972  *========================================================================*/
973
974 /*
975  * Deallocate an empty leaf node, remove it from its parent,
976  * possibly deallocating that block, etc...
977  */
978 int
979 xfs_da3_join(
980         struct xfs_da_state     *state)
981 {
982         struct xfs_da_state_blk *drop_blk;
983         struct xfs_da_state_blk *save_blk;
984         int                     action = 0;
985         int                     error;
986
987         trace_xfs_da_join(state->args);
988
989         drop_blk = &state->path.blk[ state->path.active-1 ];
990         save_blk = &state->altpath.blk[ state->path.active-1 ];
991         ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
992         ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
993                drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
994
995         /*
996          * Walk back up the tree joining/deallocating as necessary.
997          * When we stop dropping blocks, break out.
998          */
999         for (  ; state->path.active >= 2; drop_blk--, save_blk--,
1000                  state->path.active--) {
1001                 /*
1002                  * See if we can combine the block with a neighbor.
1003                  *   (action == 0) => no options, just leave
1004                  *   (action == 1) => coalesce, then unlink
1005                  *   (action == 2) => block empty, unlink it
1006                  */
1007                 switch (drop_blk->magic) {
1008                 case XFS_ATTR_LEAF_MAGIC:
1009                         error = xfs_attr3_leaf_toosmall(state, &action);
1010                         if (error)
1011                                 return(error);
1012                         if (action == 0)
1013                                 return(0);
1014                         xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
1015                         break;
1016                 case XFS_DIR2_LEAFN_MAGIC:
1017                         error = xfs_dir2_leafn_toosmall(state, &action);
1018                         if (error)
1019                                 return error;
1020                         if (action == 0)
1021                                 return 0;
1022                         xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
1023                         break;
1024                 case XFS_DA_NODE_MAGIC:
1025                         /*
1026                          * Remove the offending node, fixup hashvals,
1027                          * check for a toosmall neighbor.
1028                          */
1029                         xfs_da3_node_remove(state, drop_blk);
1030                         xfs_da3_fixhashpath(state, &state->path);
1031                         error = xfs_da3_node_toosmall(state, &action);
1032                         if (error)
1033                                 return(error);
1034                         if (action == 0)
1035                                 return 0;
1036                         xfs_da3_node_unbalance(state, drop_blk, save_blk);
1037                         break;
1038                 }
1039                 xfs_da3_fixhashpath(state, &state->altpath);
1040                 error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
1041                 xfs_da_state_kill_altpath(state);
1042                 if (error)
1043                         return(error);
1044                 error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
1045                                                          drop_blk->bp);
1046                 drop_blk->bp = NULL;
1047                 if (error)
1048                         return(error);
1049         }
1050         /*
1051          * We joined all the way to the top.  If it turns out that
1052          * we only have one entry in the root, make the child block
1053          * the new root.
1054          */
1055         xfs_da3_node_remove(state, drop_blk);
1056         xfs_da3_fixhashpath(state, &state->path);
1057         error = xfs_da3_root_join(state, &state->path.blk[0]);
1058         return(error);
1059 }
1060
#ifdef	DEBUG
/*
 * Debug-only check that a block about to become the sole child copied into
 * the root looks sane: its magic must match its level (leaf magics at
 * level 1, node magics above), and it must have no siblings.
 */
static void
xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
{
	__be16	magic = blkinfo->magic;

	if (level == 1) {
		ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
		       magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
		       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
		       magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
	} else {
		ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
		       magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
	}
	/* An only child must have no forward/backward siblings. */
	ASSERT(!blkinfo->forw);
	ASSERT(!blkinfo->back);
}
#else	/* !DEBUG */
#define	xfs_da_blkinfo_onlychild_validate(blkinfo, level)
#endif	/* !DEBUG */
1082
1083 /*
1084  * We have only one entry in the root.  Copy the only remaining child of
1085  * the old root to block 0 as the new root node.
1086  */
STATIC int
xfs_da3_root_join(
	struct xfs_da_state	*state,
	struct xfs_da_state_blk	*root_blk)
{
	struct xfs_da_intnode	*oldroot;
	struct xfs_da_args	*args;
	xfs_dablk_t		child;
	struct xfs_buf		*bp;
	struct xfs_da3_icnode_hdr oldroothdr;
	struct xfs_da_node_entry *btree;
	int			error;

	trace_xfs_da_root_join(state->args);

	ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);

	args = state->args;
	oldroot = root_blk->bp->b_addr;
	xfs_da3_node_hdr_from_disk(&oldroothdr, oldroot);
	/* The root is at the top of the tree, so it can have no siblings. */
	ASSERT(oldroothdr.forw == 0);
	ASSERT(oldroothdr.back == 0);

	/*
	 * If the root has more than one child, then don't do anything.
	 */
	if (oldroothdr.count > 1)
		return 0;

	/*
	 * Read in the (only) child block, then copy those bytes into
	 * the root block's buffer and free the original child block.
	 */
	btree = xfs_da3_node_tree_p(oldroot);
	child = be32_to_cpu(btree[0].before);
	ASSERT(child != 0);
	error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp,
					     args->whichfork);
	if (error)
		return error;
	xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);

	/*
	 * This could be copying a leaf back into the root block in the case of
	 * there only being a single leaf block left in the tree. Hence we have
	 * to update the b_ops pointer as well to match the buffer type change
	 * that could occur. For dir3 blocks we also need to update the block
	 * number in the buffer header.
	 */
	memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
	root_blk->bp->b_ops = bp->b_ops;
	xfs_trans_buf_copy_type(root_blk->bp, bp);
	if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
		/* CRC-enabled blocks self-describe their location. */
		struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
		da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
	}
	/* The whole root block changed: log it in its entirety. */
	xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
	error = xfs_da_shrink_inode(args, child, bp);
	return(error);
}
1147
1148 /*
1149  * Check a node block and its neighbors to see if the block should be
1150  * collapsed into one or the other neighbor.  Always keep the block
1151  * with the smaller block number.
1152  * If the current block is over 50% full, don't try to join it, return 0.
1153  * If the block is empty, fill in the state structure and return 2.
1154  * If it can be collapsed, fill in the state structure and return 1.
1155  * If nothing can be done, return 0.
1156  */
1157 STATIC int
1158 xfs_da3_node_toosmall(
1159         struct xfs_da_state     *state,
1160         int                     *action)
1161 {
1162         struct xfs_da_intnode   *node;
1163         struct xfs_da_state_blk *blk;
1164         struct xfs_da_blkinfo   *info;
1165         xfs_dablk_t             blkno;
1166         struct xfs_buf          *bp;
1167         struct xfs_da3_icnode_hdr nodehdr;
1168         int                     count;
1169         int                     forward;
1170         int                     error;
1171         int                     retval;
1172         int                     i;
1173
1174         trace_xfs_da_node_toosmall(state->args);
1175
1176         /*
1177          * Check for the degenerate case of the block being over 50% full.
1178          * If so, it's not worth even looking to see if we might be able
1179          * to coalesce with a sibling.
1180          */
1181         blk = &state->path.blk[ state->path.active-1 ];
1182         info = blk->bp->b_addr;
1183         node = (xfs_da_intnode_t *)info;
1184         xfs_da3_node_hdr_from_disk(&nodehdr, node);
1185         if (nodehdr.count > (state->node_ents >> 1)) {
1186                 *action = 0;    /* blk over 50%, don't try to join */
1187                 return(0);      /* blk over 50%, don't try to join */
1188         }
1189
1190         /*
1191          * Check for the degenerate case of the block being empty.
1192          * If the block is empty, we'll simply delete it, no need to
1193          * coalesce it with a sibling block.  We choose (arbitrarily)
1194          * to merge with the forward block unless it is NULL.
1195          */
1196         if (nodehdr.count == 0) {
1197                 /*
1198                  * Make altpath point to the block we want to keep and
1199                  * path point to the block we want to drop (this one).
1200                  */
1201                 forward = (info->forw != 0);
1202                 memcpy(&state->altpath, &state->path, sizeof(state->path));
1203                 error = xfs_da3_path_shift(state, &state->altpath, forward,
1204                                                  0, &retval);
1205                 if (error)
1206                         return(error);
1207                 if (retval) {
1208                         *action = 0;
1209                 } else {
1210                         *action = 2;
1211                 }
1212                 return(0);
1213         }
1214
1215         /*
1216          * Examine each sibling block to see if we can coalesce with
1217          * at least 25% free space to spare.  We need to figure out
1218          * whether to merge with the forward or the backward block.
1219          * We prefer coalescing with the lower numbered sibling so as
1220          * to shrink a directory over time.
1221          */
1222         count  = state->node_ents;
1223         count -= state->node_ents >> 2;
1224         count -= nodehdr.count;
1225
1226         /* start with smaller blk num */
1227         forward = nodehdr.forw < nodehdr.back;
1228         for (i = 0; i < 2; forward = !forward, i++) {
1229                 struct xfs_da3_icnode_hdr thdr;
1230                 if (forward)
1231                         blkno = nodehdr.forw;
1232                 else
1233                         blkno = nodehdr.back;
1234                 if (blkno == 0)
1235                         continue;
1236                 error = xfs_da3_node_read(state->args->trans, state->args->dp,
1237                                         blkno, -1, &bp, state->args->whichfork);
1238                 if (error)
1239                         return(error);
1240
1241                 node = bp->b_addr;
1242                 xfs_da3_node_hdr_from_disk(&thdr, node);
1243                 xfs_trans_brelse(state->args->trans, bp);
1244
1245                 if (count - thdr.count >= 0)
1246                         break;  /* fits with at least 25% to spare */
1247         }
1248         if (i >= 2) {
1249                 *action = 0;
1250                 return 0;
1251         }
1252
1253         /*
1254          * Make altpath point to the block we want to keep (the lower
1255          * numbered block) and path point to the block we want to drop.
1256          */
1257         memcpy(&state->altpath, &state->path, sizeof(state->path));
1258         if (blkno < blk->blkno) {
1259                 error = xfs_da3_path_shift(state, &state->altpath, forward,
1260                                                  0, &retval);
1261         } else {
1262                 error = xfs_da3_path_shift(state, &state->path, forward,
1263                                                  0, &retval);
1264         }
1265         if (error)
1266                 return error;
1267         if (retval) {
1268                 *action = 0;
1269                 return 0;
1270         }
1271         *action = 1;
1272         return 0;
1273 }
1274
1275 /*
1276  * Pick up the last hashvalue from an intermediate node.
1277  */
1278 STATIC uint
1279 xfs_da3_node_lasthash(
1280         struct xfs_buf          *bp,
1281         int                     *count)
1282 {
1283         struct xfs_da_intnode    *node;
1284         struct xfs_da_node_entry *btree;
1285         struct xfs_da3_icnode_hdr nodehdr;
1286
1287         node = bp->b_addr;
1288         xfs_da3_node_hdr_from_disk(&nodehdr, node);
1289         if (count)
1290                 *count = nodehdr.count;
1291         if (!nodehdr.count)
1292                 return 0;
1293         btree = xfs_da3_node_tree_p(node);
1294         return be32_to_cpu(btree[nodehdr.count - 1].hashval);
1295 }
1296
1297 /*
1298  * Walk back up the tree adjusting hash values as necessary,
1299  * when we stop making changes, return.
1300  */
1301 void
1302 xfs_da3_fixhashpath(
1303         struct xfs_da_state     *state,
1304         struct xfs_da_state_path *path)
1305 {
1306         struct xfs_da_state_blk *blk;
1307         struct xfs_da_intnode   *node;
1308         struct xfs_da_node_entry *btree;
1309         xfs_dahash_t            lasthash=0;
1310         int                     level;
1311         int                     count;
1312
1313         trace_xfs_da_fixhashpath(state->args);
1314
1315         level = path->active-1;
1316         blk = &path->blk[ level ];
1317         switch (blk->magic) {
1318         case XFS_ATTR_LEAF_MAGIC:
1319                 lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
1320                 if (count == 0)
1321                         return;
1322                 break;
1323         case XFS_DIR2_LEAFN_MAGIC:
1324                 lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
1325                 if (count == 0)
1326                         return;
1327                 break;
1328         case XFS_DA_NODE_MAGIC:
1329                 lasthash = xfs_da3_node_lasthash(blk->bp, &count);
1330                 if (count == 0)
1331                         return;
1332                 break;
1333         }
1334         for (blk--, level--; level >= 0; blk--, level--) {
1335                 struct xfs_da3_icnode_hdr nodehdr;
1336
1337                 node = blk->bp->b_addr;
1338                 xfs_da3_node_hdr_from_disk(&nodehdr, node);
1339                 btree = xfs_da3_node_tree_p(node);
1340                 if (be32_to_cpu(btree->hashval) == lasthash)
1341                         break;
1342                 blk->hashval = lasthash;
1343                 btree[blk->index].hashval = cpu_to_be32(lasthash);
1344                 xfs_trans_log_buf(state->args->trans, blk->bp,
1345                                   XFS_DA_LOGRANGE(node, &btree[blk->index],
1346                                                   sizeof(*btree)));
1347
1348                 lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
1349         }
1350 }
1351
1352 /*
1353  * Remove an entry from an intermediate node.
1354  */
STATIC void
xfs_da3_node_remove(
	struct xfs_da_state	*state,
	struct xfs_da_state_blk	*drop_blk)
{
	struct xfs_da_intnode	*node;
	struct xfs_da3_icnode_hdr nodehdr;
	struct xfs_da_node_entry *btree;
	int			index;
	int			tmp;

	trace_xfs_da_node_remove(state->args);

	node = drop_blk->bp->b_addr;
	xfs_da3_node_hdr_from_disk(&nodehdr, node);
	ASSERT(drop_blk->index < nodehdr.count);
	ASSERT(drop_blk->index >= 0);

	/*
	 * Copy over the offending entry, or just zero it out.
	 * If the entry is not the last one, shift the tail down over it
	 * and then retarget 'index' at the now-stale last slot so the
	 * memset below always clears the final entry.
	 */
	index = drop_blk->index;
	btree = xfs_da3_node_tree_p(node);
	if (index < nodehdr.count - 1) {
		tmp  = nodehdr.count - index - 1;
		tmp *= (uint)sizeof(xfs_da_node_entry_t);
		memmove(&btree[index], &btree[index + 1], tmp);
		xfs_trans_log_buf(state->args->trans, drop_blk->bp,
		    XFS_DA_LOGRANGE(node, &btree[index], tmp));
		index = nodehdr.count - 1;
	}
	memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
	    XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
	nodehdr.count -= 1;
	xfs_da3_node_hdr_to_disk(node, &nodehdr);
	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
	    XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));

	/*
	 * Copy the last hash value from the block to propagate upwards.
	 * 'index' is the old last slot in both paths above, so index - 1
	 * is the new last valid entry after the removal.
	 */
	drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
}
1399
1400 /*
1401  * Unbalance the elements between two intermediate nodes,
1402  * move all Btree elements from one node into another.
1403  */
STATIC void
xfs_da3_node_unbalance(
	struct xfs_da_state	*state,
	struct xfs_da_state_blk	*drop_blk,
	struct xfs_da_state_blk	*save_blk)
{
	struct xfs_da_intnode	*drop_node;
	struct xfs_da_intnode	*save_node;
	struct xfs_da_node_entry *drop_btree;
	struct xfs_da_node_entry *save_btree;
	struct xfs_da3_icnode_hdr drop_hdr;
	struct xfs_da3_icnode_hdr save_hdr;
	struct xfs_trans	*tp;
	int			sindex;		/* slot in save where drop's entries land */
	int			tmp;

	trace_xfs_da_node_unbalance(state->args);

	drop_node = drop_blk->bp->b_addr;
	save_node = save_blk->bp->b_addr;
	xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node);
	xfs_da3_node_hdr_from_disk(&save_hdr, save_node);
	drop_btree = xfs_da3_node_tree_p(drop_node);
	save_btree = xfs_da3_node_tree_p(save_node);
	tp = state->args->trans;

	/*
	 * If the dying block has lower hashvals, then move all the
	 * elements in the remaining block up to make a hole.
	 */
	if ((be32_to_cpu(drop_btree[0].hashval) <
			be32_to_cpu(save_btree[0].hashval)) ||
	    (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
			be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
		/*
		 * The memmove dst is correct: shifting save's entries up
		 * by drop_hdr.count slots opens a hole at index 0 that the
		 * memcpy at sindex == 0 below fills with drop's entries.
		 */
		tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
		memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);

		sindex = 0;
		xfs_trans_log_buf(tp, save_blk->bp,
			XFS_DA_LOGRANGE(save_node, &save_btree[0],
				(save_hdr.count + drop_hdr.count) *
						sizeof(xfs_da_node_entry_t)));
	} else {
		/* Drop's hashvals are higher: append after save's entries. */
		sindex = save_hdr.count;
		xfs_trans_log_buf(tp, save_blk->bp,
			XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
				drop_hdr.count * sizeof(xfs_da_node_entry_t)));
	}

	/*
	 * Move all the B-tree elements from drop_blk to save_blk.
	 */
	tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
	memcpy(&save_btree[sindex], &drop_btree[0], tmp);
	save_hdr.count += drop_hdr.count;

	xfs_da3_node_hdr_to_disk(save_node, &save_hdr);
	xfs_trans_log_buf(tp, save_blk->bp,
		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
				xfs_da3_node_hdr_size(save_node)));

	/*
	 * Save the last hashval in the remaining block for upward propagation.
	 */
	save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
}
1471
1472 /*========================================================================
1473  * Routines used for finding things in the Btree.
1474  *========================================================================*/
1475
1476 /*
1477  * Walk down the Btree looking for a particular filename, filling
1478  * in the state structure as we go.
1479  *
1480  * We will set the state structure to point to each of the elements
1481  * in each of the nodes where either the hashval is or should be.
1482  *
1483  * We support duplicate hashval's so for each entry in the current
1484  * node that could contain the desired hashval, descend.  This is a
1485  * pruned depth-first tree search.
1486  */
1487 int                                                     /* error */
1488 xfs_da3_node_lookup_int(
1489         struct xfs_da_state     *state,
1490         int                     *result)
1491 {
1492         struct xfs_da_state_blk *blk;
1493         struct xfs_da_blkinfo   *curr;
1494         struct xfs_da_intnode   *node;
1495         struct xfs_da_node_entry *btree;
1496         struct xfs_da3_icnode_hdr nodehdr;
1497         struct xfs_da_args      *args;
1498         xfs_dablk_t             blkno;
1499         xfs_dahash_t            hashval;
1500         xfs_dahash_t            btreehashval;
1501         int                     probe;
1502         int                     span;
1503         int                     max;
1504         int                     error;
1505         int                     retval;
1506
1507         args = state->args;
1508
1509         /*
1510          * Descend thru the B-tree searching each level for the right
1511          * node to use, until the right hashval is found.
1512          */
1513         blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
1514         for (blk = &state->path.blk[0], state->path.active = 1;
1515                          state->path.active <= XFS_DA_NODE_MAXDEPTH;
1516                          blk++, state->path.active++) {
1517                 /*
1518                  * Read the next node down in the tree.
1519                  */
1520                 blk->blkno = blkno;
1521                 error = xfs_da3_node_read(args->trans, args->dp, blkno,
1522                                         -1, &blk->bp, args->whichfork);
1523                 if (error) {
1524                         blk->blkno = 0;
1525                         state->path.active--;
1526                         return(error);
1527                 }
1528                 curr = blk->bp->b_addr;
1529                 blk->magic = be16_to_cpu(curr->magic);
1530
1531                 if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
1532                     blk->magic == XFS_ATTR3_LEAF_MAGIC) {
1533                         blk->magic = XFS_ATTR_LEAF_MAGIC;
1534                         blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1535                         break;
1536                 }
1537
1538                 if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1539                     blk->magic == XFS_DIR3_LEAFN_MAGIC) {
1540                         blk->magic = XFS_DIR2_LEAFN_MAGIC;
1541                         blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
1542                         break;
1543                 }
1544
1545                 blk->magic = XFS_DA_NODE_MAGIC;
1546
1547
1548                 /*
1549                  * Search an intermediate node for a match.
1550                  */
1551                 node = blk->bp->b_addr;
1552                 xfs_da3_node_hdr_from_disk(&nodehdr, node);
1553                 btree = xfs_da3_node_tree_p(node);
1554
1555                 max = nodehdr.count;
1556                 blk->hashval = be32_to_cpu(btree[max - 1].hashval);
1557
1558                 /*
1559                  * Binary search.  (note: small blocks will skip loop)
1560                  */
1561                 probe = span = max / 2;
1562                 hashval = args->hashval;
1563                 while (span > 4) {
1564                         span /= 2;
1565                         btreehashval = be32_to_cpu(btree[probe].hashval);
1566                         if (btreehashval < hashval)
1567                                 probe += span;
1568                         else if (btreehashval > hashval)
1569                                 probe -= span;
1570                         else
1571                                 break;
1572                 }
1573                 ASSERT((probe >= 0) && (probe < max));
1574                 ASSERT((span <= 4) ||
1575                         (be32_to_cpu(btree[probe].hashval) == hashval));
1576
1577                 /*
1578                  * Since we may have duplicate hashval's, find the first
1579                  * matching hashval in the node.
1580                  */
1581                 while (probe > 0 &&
1582                        be32_to_cpu(btree[probe].hashval) >= hashval) {
1583                         probe--;
1584                 }
1585                 while (probe < max &&
1586                        be32_to_cpu(btree[probe].hashval) < hashval) {
1587                         probe++;
1588                 }
1589
1590                 /*
1591                  * Pick the right block to descend on.
1592                  */
1593                 if (probe == max) {
1594                         blk->index = max - 1;
1595                         blkno = be32_to_cpu(btree[max - 1].before);
1596                 } else {
1597                         blk->index = probe;
1598                         blkno = be32_to_cpu(btree[probe].before);
1599                 }
1600         }
1601
1602         /*
1603          * A leaf block that ends in the hashval that we are interested in
1604          * (final hashval == search hashval) means that the next block may
1605          * contain more entries with the same hashval, shift upward to the
1606          * next leaf and keep searching.
1607          */
1608         for (;;) {
1609                 if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1610                         retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
1611                                                         &blk->index, state);
1612                 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1613                         retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
1614                         blk->index = args->index;
1615                         args->blkno = blk->blkno;
1616                 } else {
1617                         ASSERT(0);
1618                         return XFS_ERROR(EFSCORRUPTED);
1619                 }
1620                 if (((retval == ENOENT) || (retval == ENOATTR)) &&
1621                     (blk->hashval == args->hashval)) {
1622                         error = xfs_da3_path_shift(state, &state->path, 1, 1,
1623                                                          &retval);
1624                         if (error)
1625                                 return(error);
1626                         if (retval == 0) {
1627                                 continue;
1628                         } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1629                                 /* path_shift() gives ENOENT */
1630                                 retval = XFS_ERROR(ENOATTR);
1631                         }
1632                 }
1633                 break;
1634         }
1635         *result = retval;
1636         return(0);
1637 }
1638
1639 /*========================================================================
1640  * Utility routines.
1641  *========================================================================*/
1642
1643 /*
1644  * Compare two intermediate nodes for "order".
1645  */
1646 STATIC int
1647 xfs_da3_node_order(
1648         struct xfs_buf  *node1_bp,
1649         struct xfs_buf  *node2_bp)
1650 {
1651         struct xfs_da_intnode   *node1;
1652         struct xfs_da_intnode   *node2;
1653         struct xfs_da_node_entry *btree1;
1654         struct xfs_da_node_entry *btree2;
1655         struct xfs_da3_icnode_hdr node1hdr;
1656         struct xfs_da3_icnode_hdr node2hdr;
1657
1658         node1 = node1_bp->b_addr;
1659         node2 = node2_bp->b_addr;
1660         xfs_da3_node_hdr_from_disk(&node1hdr, node1);
1661         xfs_da3_node_hdr_from_disk(&node2hdr, node2);
1662         btree1 = xfs_da3_node_tree_p(node1);
1663         btree2 = xfs_da3_node_tree_p(node2);
1664
1665         if (node1hdr.count > 0 && node2hdr.count > 0 &&
1666             ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
1667              (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
1668               be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
1669                 return 1;
1670         }
1671         return 0;
1672 }
1673
1674 /*
1675  * Link a new block into a doubly linked list of blocks (of whatever type).
1676  */
1677 int                                                     /* error */
1678 xfs_da3_blk_link(
1679         struct xfs_da_state     *state,
1680         struct xfs_da_state_blk *old_blk,
1681         struct xfs_da_state_blk *new_blk)
1682 {
1683         struct xfs_da_blkinfo   *old_info;
1684         struct xfs_da_blkinfo   *new_info;
1685         struct xfs_da_blkinfo   *tmp_info;
1686         struct xfs_da_args      *args;
1687         struct xfs_buf          *bp;
1688         int                     before = 0;
1689         int                     error;
1690
1691         /*
1692          * Set up environment.
1693          */
1694         args = state->args;
1695         ASSERT(args != NULL);
1696         old_info = old_blk->bp->b_addr;
1697         new_info = new_blk->bp->b_addr;
1698         ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
1699                old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1700                old_blk->magic == XFS_ATTR_LEAF_MAGIC);
1701
1702         switch (old_blk->magic) {
1703         case XFS_ATTR_LEAF_MAGIC:
1704                 before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
1705                 break;
1706         case XFS_DIR2_LEAFN_MAGIC:
1707                 before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
1708                 break;
1709         case XFS_DA_NODE_MAGIC:
1710                 before = xfs_da3_node_order(old_blk->bp, new_blk->bp);
1711                 break;
1712         }
1713
1714         /*
1715          * Link blocks in appropriate order.
1716          */
1717         if (before) {
1718                 /*
1719                  * Link new block in before existing block.
1720                  */
1721                 trace_xfs_da_link_before(args);
1722                 new_info->forw = cpu_to_be32(old_blk->blkno);
1723                 new_info->back = old_info->back;
1724                 if (old_info->back) {
1725                         error = xfs_da3_node_read(args->trans, args->dp,
1726                                                 be32_to_cpu(old_info->back),
1727                                                 -1, &bp, args->whichfork);
1728                         if (error)
1729                                 return(error);
1730                         ASSERT(bp != NULL);
1731                         tmp_info = bp->b_addr;
1732                         ASSERT(tmp_info->magic == old_info->magic);
1733                         ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
1734                         tmp_info->forw = cpu_to_be32(new_blk->blkno);
1735                         xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1736                 }
1737                 old_info->back = cpu_to_be32(new_blk->blkno);
1738         } else {
1739                 /*
1740                  * Link new block in after existing block.
1741                  */
1742                 trace_xfs_da_link_after(args);
1743                 new_info->forw = old_info->forw;
1744                 new_info->back = cpu_to_be32(old_blk->blkno);
1745                 if (old_info->forw) {
1746                         error = xfs_da3_node_read(args->trans, args->dp,
1747                                                 be32_to_cpu(old_info->forw),
1748                                                 -1, &bp, args->whichfork);
1749                         if (error)
1750                                 return(error);
1751                         ASSERT(bp != NULL);
1752                         tmp_info = bp->b_addr;
1753                         ASSERT(tmp_info->magic == old_info->magic);
1754                         ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
1755                         tmp_info->back = cpu_to_be32(new_blk->blkno);
1756                         xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1757                 }
1758                 old_info->forw = cpu_to_be32(new_blk->blkno);
1759         }
1760
1761         xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
1762         xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
1763         return(0);
1764 }
1765
1766 /*
1767  * Unlink a block from a doubly linked list of blocks.
1768  */
1769 STATIC int                                              /* error */
1770 xfs_da3_blk_unlink(
1771         struct xfs_da_state     *state,
1772         struct xfs_da_state_blk *drop_blk,
1773         struct xfs_da_state_blk *save_blk)
1774 {
1775         struct xfs_da_blkinfo   *drop_info;
1776         struct xfs_da_blkinfo   *save_info;
1777         struct xfs_da_blkinfo   *tmp_info;
1778         struct xfs_da_args      *args;
1779         struct xfs_buf          *bp;
1780         int                     error;
1781
1782         /*
1783          * Set up environment.
1784          */
1785         args = state->args;
1786         ASSERT(args != NULL);
1787         save_info = save_blk->bp->b_addr;
1788         drop_info = drop_blk->bp->b_addr;
1789         ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
1790                save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1791                save_blk->magic == XFS_ATTR_LEAF_MAGIC);
1792         ASSERT(save_blk->magic == drop_blk->magic);
1793         ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
1794                (be32_to_cpu(save_info->back) == drop_blk->blkno));
1795         ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
1796                (be32_to_cpu(drop_info->back) == save_blk->blkno));
1797
1798         /*
1799          * Unlink the leaf block from the doubly linked chain of leaves.
1800          */
1801         if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
1802                 trace_xfs_da_unlink_back(args);
1803                 save_info->back = drop_info->back;
1804                 if (drop_info->back) {
1805                         error = xfs_da3_node_read(args->trans, args->dp,
1806                                                 be32_to_cpu(drop_info->back),
1807                                                 -1, &bp, args->whichfork);
1808                         if (error)
1809                                 return(error);
1810                         ASSERT(bp != NULL);
1811                         tmp_info = bp->b_addr;
1812                         ASSERT(tmp_info->magic == save_info->magic);
1813                         ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
1814                         tmp_info->forw = cpu_to_be32(save_blk->blkno);
1815                         xfs_trans_log_buf(args->trans, bp, 0,
1816                                                     sizeof(*tmp_info) - 1);
1817                 }
1818         } else {
1819                 trace_xfs_da_unlink_forward(args);
1820                 save_info->forw = drop_info->forw;
1821                 if (drop_info->forw) {
1822                         error = xfs_da3_node_read(args->trans, args->dp,
1823                                                 be32_to_cpu(drop_info->forw),
1824                                                 -1, &bp, args->whichfork);
1825                         if (error)
1826                                 return(error);
1827                         ASSERT(bp != NULL);
1828                         tmp_info = bp->b_addr;
1829                         ASSERT(tmp_info->magic == save_info->magic);
1830                         ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
1831                         tmp_info->back = cpu_to_be32(save_blk->blkno);
1832                         xfs_trans_log_buf(args->trans, bp, 0,
1833                                                     sizeof(*tmp_info) - 1);
1834                 }
1835         }
1836
1837         xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
1838         return(0);
1839 }
1840
1841 /*
1842  * Move a path "forward" or "!forward" one block at the current level.
1843  *
1844  * This routine will adjust a "path" to point to the next block
1845  * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
1846  * Btree, including updating pointers to the intermediate nodes between
1847  * the new bottom and the root.
1848  */
1849 int                                                     /* error */
1850 xfs_da3_path_shift(
1851         struct xfs_da_state     *state,
1852         struct xfs_da_state_path *path,
1853         int                     forward,
1854         int                     release,
1855         int                     *result)
1856 {
1857         struct xfs_da_state_blk *blk;
1858         struct xfs_da_blkinfo   *info;
1859         struct xfs_da_intnode   *node;
1860         struct xfs_da_args      *args;
1861         struct xfs_da_node_entry *btree;
1862         struct xfs_da3_icnode_hdr nodehdr;
1863         xfs_dablk_t             blkno = 0;
1864         int                     level;
1865         int                     error;
1866
1867         trace_xfs_da_path_shift(state->args);
1868
1869         /*
1870          * Roll up the Btree looking for the first block where our
1871          * current index is not at the edge of the block.  Note that
1872          * we skip the bottom layer because we want the sibling block.
1873          */
1874         args = state->args;
1875         ASSERT(args != NULL);
1876         ASSERT(path != NULL);
1877         ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1878         level = (path->active-1) - 1;   /* skip bottom layer in path */
1879         for (blk = &path->blk[level]; level >= 0; blk--, level--) {
1880                 node = blk->bp->b_addr;
1881                 xfs_da3_node_hdr_from_disk(&nodehdr, node);
1882                 btree = xfs_da3_node_tree_p(node);
1883
1884                 if (forward && (blk->index < nodehdr.count - 1)) {
1885                         blk->index++;
1886                         blkno = be32_to_cpu(btree[blk->index].before);
1887                         break;
1888                 } else if (!forward && (blk->index > 0)) {
1889                         blk->index--;
1890                         blkno = be32_to_cpu(btree[blk->index].before);
1891                         break;
1892                 }
1893         }
1894         if (level < 0) {
1895                 *result = XFS_ERROR(ENOENT);    /* we're out of our tree */
1896                 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
1897                 return(0);
1898         }
1899
1900         /*
1901          * Roll down the edge of the subtree until we reach the
1902          * same depth we were at originally.
1903          */
1904         for (blk++, level++; level < path->active; blk++, level++) {
1905                 /*
1906                  * Release the old block.
1907                  * (if it's dirty, trans won't actually let go)
1908                  */
1909                 if (release)
1910                         xfs_trans_brelse(args->trans, blk->bp);
1911
1912                 /*
1913                  * Read the next child block.
1914                  */
1915                 blk->blkno = blkno;
1916                 error = xfs_da3_node_read(args->trans, args->dp, blkno, -1,
1917                                         &blk->bp, args->whichfork);
1918                 if (error)
1919                         return(error);
1920                 info = blk->bp->b_addr;
1921                 ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
1922                        info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
1923                        info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
1924                        info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
1925                        info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
1926                        info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
1927
1928
1929                 /*
1930                  * Note: we flatten the magic number to a single type so we
1931                  * don't have to compare against crc/non-crc types elsewhere.
1932                  */
1933                 switch (be16_to_cpu(info->magic)) {
1934                 case XFS_DA_NODE_MAGIC:
1935                 case XFS_DA3_NODE_MAGIC:
1936                         blk->magic = XFS_DA_NODE_MAGIC;
1937                         node = (xfs_da_intnode_t *)info;
1938                         xfs_da3_node_hdr_from_disk(&nodehdr, node);
1939                         btree = xfs_da3_node_tree_p(node);
1940                         blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
1941                         if (forward)
1942                                 blk->index = 0;
1943                         else
1944                                 blk->index = nodehdr.count - 1;
1945                         blkno = be32_to_cpu(btree[blk->index].before);
1946                         break;
1947                 case XFS_ATTR_LEAF_MAGIC:
1948                 case XFS_ATTR3_LEAF_MAGIC:
1949                         blk->magic = XFS_ATTR_LEAF_MAGIC;
1950                         ASSERT(level == path->active-1);
1951                         blk->index = 0;
1952                         blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
1953                                                               NULL);
1954                         break;
1955                 case XFS_DIR2_LEAFN_MAGIC:
1956                 case XFS_DIR3_LEAFN_MAGIC:
1957                         blk->magic = XFS_DIR2_LEAFN_MAGIC;
1958                         ASSERT(level == path->active-1);
1959                         blk->index = 0;
1960                         blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
1961                                                                NULL);
1962                         break;
1963                 default:
1964                         ASSERT(0);
1965                         break;
1966                 }
1967         }
1968         *result = 0;
1969         return 0;
1970 }
1971
1972
1973 /*========================================================================
1974  * Utility routines.
1975  *========================================================================*/
1976
1977 /*
1978  * Implement a simple hash on a character string.
1979  * Rotate the hash value by 7 bits, then XOR each character in.
1980  * This is implemented with some source-level loop unrolling.
1981  */
1982 xfs_dahash_t
1983 xfs_da_hashname(const __uint8_t *name, int namelen)
1984 {
1985         xfs_dahash_t hash;
1986
1987         /*
1988          * Do four characters at a time as long as we can.
1989          */
1990         for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
1991                 hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
1992                        (name[3] << 0) ^ rol32(hash, 7 * 4);
1993
1994         /*
1995          * Now do the rest of the characters.
1996          */
1997         switch (namelen) {
1998         case 3:
1999                 return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
2000                        rol32(hash, 7 * 3);
2001         case 2:
2002                 return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
2003         case 1:
2004                 return (name[0] << 0) ^ rol32(hash, 7 * 1);
2005         default: /* case 0: */
2006                 return hash;
2007         }
2008 }
2009
2010 enum xfs_dacmp
2011 xfs_da_compname(
2012         struct xfs_da_args *args,
2013         const unsigned char *name,
2014         int             len)
2015 {
2016         return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
2017                                         XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
2018 }
2019
/* Default .hashname implementation: hash the whole name buffer. */
static xfs_dahash_t
xfs_default_hashname(
	struct xfs_name *name)
{
	return xfs_da_hashname(name->name, name->len);
}
2026
/* Default name hashing/comparison operations vector. */
const struct xfs_nameops xfs_default_nameops = {
	.hashname	= xfs_default_hashname,
	.compname	= xfs_da_compname
};
2031
/*
 * Find "count" free blocks of file offset space in the given fork and
 * map them in, contiguous in file offset (though possibly spread over
 * several on-disk extents).  On success *bno is set to the file offset
 * of the first new block.
 *
 * Returns 0 on success, ENOSPC if the whole range could not be mapped,
 * or another error from the bmap calls.
 */
int
xfs_da_grow_inode_int(
	struct xfs_da_args	*args,
	xfs_fileoff_t		*bno,
	int			count)
{
	struct xfs_trans	*tp = args->trans;
	struct xfs_inode	*dp = args->dp;
	int			w = args->whichfork;
	xfs_drfsbno_t		nblks = dp->i_d.di_nblocks;	/* block count before allocating */
	struct xfs_bmbt_irec	map, *mapp;
	int			nmap, error, got, i, mapi;

	/*
	 * Find a spot in the file space to put the new block.
	 */
	error = xfs_bmap_first_unused(tp, dp, count, bno, w);
	if (error)
		return error;

	/*
	 * Try mapping it in one filesystem block.
	 */
	nmap = 1;
	ASSERT(args->firstblock != NULL);
	error = xfs_bmapi_write(tp, dp, *bno, count,
			xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
			args->firstblock, args->total, &map, &nmap,
			args->flist);
	if (error)
		return error;

	ASSERT(nmap <= 1);
	if (nmap == 1) {
		/* Got the whole range in a single on-disk extent. */
		mapp = &map;
		mapi = 1;
	} else if (nmap == 0 && count > 1) {
		xfs_fileoff_t		b;
		int			c;

		/*
		 * If we didn't get it and the block might work if fragmented,
		 * try without the CONTIG flag.  Loop until we get it all.
		 */
		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
		for (b = *bno, mapi = 0; b < *bno + count; ) {
			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
			c = (int)(*bno + count - b);	/* blocks still to map */
			error = xfs_bmapi_write(tp, dp, b, c,
					xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
					args->firstblock, args->total,
					&mapp[mapi], &nmap, args->flist);
			if (error)
				goto out_free_map;
			if (nmap < 1)
				break;
			mapi += nmap;
			b = mapp[mapi - 1].br_startoff +
			    mapp[mapi - 1].br_blockcount;
		}
	} else {
		/* Single-block request failed: nothing to retry piecewise. */
		mapi = 0;
		mapp = NULL;
	}

	/*
	 * Count the blocks we got, make sure it matches the total.
	 * (If mapi == 0, got != count short-circuits before mapp[0].)
	 */
	for (i = 0, got = 0; i < mapi; i++)
		got += mapp[i].br_blockcount;
	if (got != count || mapp[0].br_startoff != *bno ||
	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
	    *bno + count) {
		error = XFS_ERROR(ENOSPC);
		goto out_free_map;
	}

	/* account for newly allocated blocks in reserved blocks total */
	args->total -= dp->i_d.di_nblocks - nblks;

out_free_map:
	if (mapp != &map)
		kmem_free(mapp);
	return error;
}
2117
2118 /*
2119  * Add a block to the btree ahead of the file.
2120  * Return the new block number to the caller.
2121  */
2122 int
2123 xfs_da_grow_inode(
2124         struct xfs_da_args      *args,
2125         xfs_dablk_t             *new_blkno)
2126 {
2127         xfs_fileoff_t           bno;
2128         int                     count;
2129         int                     error;
2130
2131         trace_xfs_da_grow_inode(args);
2132
2133         if (args->whichfork == XFS_DATA_FORK) {
2134                 bno = args->dp->i_mount->m_dirleafblk;
2135                 count = args->dp->i_mount->m_dirblkfsbs;
2136         } else {
2137                 bno = 0;
2138                 count = 1;
2139         }
2140
2141         error = xfs_da_grow_inode_int(args, &bno, count);
2142         if (!error)
2143                 *new_blkno = (xfs_dablk_t)bno;
2144         return error;
2145 }
2146
2147 /*
2148  * Ick.  We need to always be able to remove a btree block, even
2149  * if there's no space reservation because the filesystem is full.
2150  * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
2151  * It swaps the target block with the last block in the file.  The
2152  * last block in the file can always be removed since it can't cause
2153  * a bmap btree split to do that.
2154  */
2155 STATIC int
2156 xfs_da3_swap_lastblock(
2157         struct xfs_da_args      *args,
2158         xfs_dablk_t             *dead_blknop,
2159         struct xfs_buf          **dead_bufp)
2160 {
2161         struct xfs_da_blkinfo   *dead_info;
2162         struct xfs_da_blkinfo   *sib_info;
2163         struct xfs_da_intnode   *par_node;
2164         struct xfs_da_intnode   *dead_node;
2165         struct xfs_dir2_leaf    *dead_leaf2;
2166         struct xfs_da_node_entry *btree;
2167         struct xfs_da3_icnode_hdr par_hdr;
2168         struct xfs_inode        *ip;
2169         struct xfs_trans        *tp;
2170         struct xfs_mount        *mp;
2171         struct xfs_buf          *dead_buf;
2172         struct xfs_buf          *last_buf;
2173         struct xfs_buf          *sib_buf;
2174         struct xfs_buf          *par_buf;
2175         xfs_dahash_t            dead_hash;
2176         xfs_fileoff_t           lastoff;
2177         xfs_dablk_t             dead_blkno;
2178         xfs_dablk_t             last_blkno;
2179         xfs_dablk_t             sib_blkno;
2180         xfs_dablk_t             par_blkno;
2181         int                     error;
2182         int                     w;
2183         int                     entno;
2184         int                     level;
2185         int                     dead_level;
2186
2187         trace_xfs_da_swap_lastblock(args);
2188
2189         dead_buf = *dead_bufp;
2190         dead_blkno = *dead_blknop;
2191         tp = args->trans;
2192         ip = args->dp;
2193         w = args->whichfork;
2194         ASSERT(w == XFS_DATA_FORK);
2195         mp = ip->i_mount;
2196         lastoff = mp->m_dirfreeblk;
2197         error = xfs_bmap_last_before(tp, ip, &lastoff, w);
2198         if (error)
2199                 return error;
2200         if (unlikely(lastoff == 0)) {
2201                 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
2202                                  mp);
2203                 return XFS_ERROR(EFSCORRUPTED);
2204         }
2205         /*
2206          * Read the last block in the btree space.
2207          */
2208         last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
2209         error = xfs_da3_node_read(tp, ip, last_blkno, -1, &last_buf, w);
2210         if (error)
2211                 return error;
2212         /*
2213          * Copy the last block into the dead buffer and log it.
2214          */
2215         memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize);
2216         xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
2217         dead_info = dead_buf->b_addr;
2218         /*
2219          * Get values from the moved block.
2220          */
2221         if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
2222             dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
2223                 struct xfs_dir3_icleaf_hdr leafhdr;
2224                 struct xfs_dir2_leaf_entry *ents;
2225
2226                 dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
2227                 xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2);
2228                 ents = xfs_dir3_leaf_ents_p(dead_leaf2);
2229                 dead_level = 0;
2230                 dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
2231         } else {
2232                 struct xfs_da3_icnode_hdr deadhdr;
2233
2234                 dead_node = (xfs_da_intnode_t *)dead_info;
2235                 xfs_da3_node_hdr_from_disk(&deadhdr, dead_node);
2236                 btree = xfs_da3_node_tree_p(dead_node);
2237                 dead_level = deadhdr.level;
2238                 dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
2239         }
2240         sib_buf = par_buf = NULL;
2241         /*
2242          * If the moved block has a left sibling, fix up the pointers.
2243          */
2244         if ((sib_blkno = be32_to_cpu(dead_info->back))) {
2245                 error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
2246                 if (error)
2247                         goto done;
2248                 sib_info = sib_buf->b_addr;
2249                 if (unlikely(
2250                     be32_to_cpu(sib_info->forw) != last_blkno ||
2251                     sib_info->magic != dead_info->magic)) {
2252                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
2253                                          XFS_ERRLEVEL_LOW, mp);
2254                         error = XFS_ERROR(EFSCORRUPTED);
2255                         goto done;
2256                 }
2257                 sib_info->forw = cpu_to_be32(dead_blkno);
2258                 xfs_trans_log_buf(tp, sib_buf,
2259                         XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
2260                                         sizeof(sib_info->forw)));
2261                 sib_buf = NULL;
2262         }
2263         /*
2264          * If the moved block has a right sibling, fix up the pointers.
2265          */
2266         if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
2267                 error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
2268                 if (error)
2269                         goto done;
2270                 sib_info = sib_buf->b_addr;
2271                 if (unlikely(
2272                        be32_to_cpu(sib_info->back) != last_blkno ||
2273                        sib_info->magic != dead_info->magic)) {
2274                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
2275                                          XFS_ERRLEVEL_LOW, mp);
2276                         error = XFS_ERROR(EFSCORRUPTED);
2277                         goto done;
2278                 }
2279                 sib_info->back = cpu_to_be32(dead_blkno);
2280                 xfs_trans_log_buf(tp, sib_buf,
2281                         XFS_DA_LOGRANGE(sib_info, &sib_info->back,
2282                                         sizeof(sib_info->back)));
2283                 sib_buf = NULL;
2284         }
2285         par_blkno = mp->m_dirleafblk;
2286         level = -1;
2287         /*
2288          * Walk down the tree looking for the parent of the moved block.
2289          */
2290         for (;;) {
2291                 error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
2292                 if (error)
2293                         goto done;
2294                 par_node = par_buf->b_addr;
2295                 xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
2296                 if (level >= 0 && level != par_hdr.level + 1) {
2297                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
2298                                          XFS_ERRLEVEL_LOW, mp);
2299                         error = XFS_ERROR(EFSCORRUPTED);
2300                         goto done;
2301                 }
2302                 level = par_hdr.level;
2303                 btree = xfs_da3_node_tree_p(par_node);
2304                 for (entno = 0;
2305                      entno < par_hdr.count &&
2306                      be32_to_cpu(btree[entno].hashval) < dead_hash;
2307                      entno++)
2308                         continue;
2309                 if (entno == par_hdr.count) {
2310                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
2311                                          XFS_ERRLEVEL_LOW, mp);
2312                         error = XFS_ERROR(EFSCORRUPTED);
2313                         goto done;
2314                 }
2315                 par_blkno = be32_to_cpu(btree[entno].before);
2316                 if (level == dead_level + 1)
2317                         break;
2318                 xfs_trans_brelse(tp, par_buf);
2319                 par_buf = NULL;
2320         }
2321         /*
2322          * We're in the right parent block.
2323          * Look for the right entry.
2324          */
2325         for (;;) {
2326                 for (;
2327                      entno < par_hdr.count &&
2328                      be32_to_cpu(btree[entno].before) != last_blkno;
2329                      entno++)
2330                         continue;
2331                 if (entno < par_hdr.count)
2332                         break;
2333                 par_blkno = par_hdr.forw;
2334                 xfs_trans_brelse(tp, par_buf);
2335                 par_buf = NULL;
2336                 if (unlikely(par_blkno == 0)) {
2337                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
2338                                          XFS_ERRLEVEL_LOW, mp);
2339                         error = XFS_ERROR(EFSCORRUPTED);
2340                         goto done;
2341                 }
2342                 error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
2343                 if (error)
2344                         goto done;
2345                 par_node = par_buf->b_addr;
2346                 xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
2347                 if (par_hdr.level != level) {
2348                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
2349                                          XFS_ERRLEVEL_LOW, mp);
2350                         error = XFS_ERROR(EFSCORRUPTED);
2351                         goto done;
2352                 }
2353                 btree = xfs_da3_node_tree_p(par_node);
2354                 entno = 0;
2355         }
2356         /*
2357          * Update the parent entry pointing to the moved block.
2358          */
2359         btree[entno].before = cpu_to_be32(dead_blkno);
2360         xfs_trans_log_buf(tp, par_buf,
2361                 XFS_DA_LOGRANGE(par_node, &btree[entno].before,
2362                                 sizeof(btree[entno].before)));
2363         *dead_blknop = last_blkno;
2364         *dead_bufp = last_buf;
2365         return 0;
2366 done:
2367         if (par_buf)
2368                 xfs_trans_brelse(tp, par_buf);
2369         if (sib_buf)
2370                 xfs_trans_brelse(tp, sib_buf);
2371         xfs_trans_brelse(tp, last_buf);
2372         return error;
2373 }
2374
2375 /*
2376  * Remove a btree block from a directory or attribute.
2377  */
2378 int
2379 xfs_da_shrink_inode(
2380         xfs_da_args_t   *args,
2381         xfs_dablk_t     dead_blkno,
2382         struct xfs_buf  *dead_buf)
2383 {
2384         xfs_inode_t *dp;
2385         int done, error, w, count;
2386         xfs_trans_t *tp;
2387         xfs_mount_t *mp;
2388
2389         trace_xfs_da_shrink_inode(args);
2390
2391         dp = args->dp;
2392         w = args->whichfork;
2393         tp = args->trans;
2394         mp = dp->i_mount;
2395         if (w == XFS_DATA_FORK)
2396                 count = mp->m_dirblkfsbs;
2397         else
2398                 count = 1;
2399         for (;;) {
2400                 /*
2401                  * Remove extents.  If we get ENOSPC for a dir we have to move
2402                  * the last block to the place we want to kill.
2403                  */
2404                 error = xfs_bunmapi(tp, dp, dead_blkno, count,
2405                                     xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
2406                                     0, args->firstblock, args->flist, &done);
2407                 if (error == ENOSPC) {
2408                         if (w != XFS_DATA_FORK)
2409                                 break;
2410                         error = xfs_da3_swap_lastblock(args, &dead_blkno,
2411                                                       &dead_buf);
2412                         if (error)
2413                                 break;
2414                 } else {
2415                         break;
2416                 }
2417         }
2418         xfs_trans_binval(tp, dead_buf);
2419         return error;
2420 }
2421
2422 /*
2423  * See if the mapping(s) for this btree block are valid, i.e.
2424  * don't contain holes, are logically contiguous, and cover the whole range.
2425  */
2426 STATIC int
2427 xfs_da_map_covers_blocks(
2428         int             nmap,
2429         xfs_bmbt_irec_t *mapp,
2430         xfs_dablk_t     bno,
2431         int             count)
2432 {
2433         int             i;
2434         xfs_fileoff_t   off;
2435
2436         for (i = 0, off = bno; i < nmap; i++) {
2437                 if (mapp[i].br_startblock == HOLESTARTBLOCK ||
2438                     mapp[i].br_startblock == DELAYSTARTBLOCK) {
2439                         return 0;
2440                 }
2441                 if (off != mapp[i].br_startoff) {
2442                         return 0;
2443                 }
2444                 off += mapp[i].br_blockcount;
2445         }
2446         return off == bno + count;
2447 }
2448
2449 /*
2450  * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
2451  *
2452  * For the single map case, it is assumed that the caller has provided a pointer
2453  * to a valid xfs_buf_map.  For the multiple map case, this function will
2454  * allocate the xfs_buf_map to hold all the maps and replace the caller's single
2455  * map pointer with the allocated map.
2456  */
2457 static int
2458 xfs_buf_map_from_irec(
2459         struct xfs_mount        *mp,
2460         struct xfs_buf_map      **mapp,
2461         int                     *nmaps,
2462         struct xfs_bmbt_irec    *irecs,
2463         int                     nirecs)
2464 {
2465         struct xfs_buf_map      *map;
2466         int                     i;
2467
2468         ASSERT(*nmaps == 1);
2469         ASSERT(nirecs >= 1);
2470
2471         if (nirecs > 1) {
2472                 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
2473                                   KM_SLEEP | KM_NOFS);
2474                 if (!map)
2475                         return ENOMEM;
2476                 *mapp = map;
2477         }
2478
2479         *nmaps = nirecs;
2480         map = *mapp;
2481         for (i = 0; i < *nmaps; i++) {
2482                 ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
2483                        irecs[i].br_startblock != HOLESTARTBLOCK);
2484                 map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
2485                 map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
2486         }
2487         return 0;
2488 }
2489
2490 /*
2491  * Map the block we are given ready for reading. There are three possible return
2492  * values:
2493  *      -1 - will be returned if we land in a hole and mappedbno == -2 so the
2494  *           caller knows not to execute a subsequent read.
2495  *       0 - if we mapped the block successfully
2496  *      >0 - positive error number if there was an error.
2497  */
2498 static int
2499 xfs_dabuf_map(
2500         struct xfs_trans        *trans,
2501         struct xfs_inode        *dp,
2502         xfs_dablk_t             bno,
2503         xfs_daddr_t             mappedbno,
2504         int                     whichfork,
2505         struct xfs_buf_map      **map,
2506         int                     *nmaps)
2507 {
2508         struct xfs_mount        *mp = dp->i_mount;
2509         int                     nfsb;
2510         int                     error = 0;
2511         struct xfs_bmbt_irec    irec;
2512         struct xfs_bmbt_irec    *irecs = &irec;
2513         int                     nirecs;
2514
2515         ASSERT(map && *map);
2516         ASSERT(*nmaps == 1);
2517
2518         nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
2519
2520         /*
2521          * Caller doesn't have a mapping.  -2 means don't complain
2522          * if we land in a hole.
2523          */
2524         if (mappedbno == -1 || mappedbno == -2) {
2525                 /*
2526                  * Optimize the one-block case.
2527                  */
2528                 if (nfsb != 1)
2529                         irecs = kmem_zalloc(sizeof(irec) * nfsb,
2530                                             KM_SLEEP | KM_NOFS);
2531
2532                 nirecs = nfsb;
2533                 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
2534                                        &nirecs, xfs_bmapi_aflag(whichfork));
2535                 if (error)
2536                         goto out;
2537         } else {
2538                 irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
2539                 irecs->br_startoff = (xfs_fileoff_t)bno;
2540                 irecs->br_blockcount = nfsb;
2541                 irecs->br_state = 0;
2542                 nirecs = 1;
2543         }
2544
2545         if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
2546                 error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED);
2547                 if (unlikely(error == EFSCORRUPTED)) {
2548                         if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
2549                                 int i;
2550                                 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
2551                                         __func__, (long long)bno,
2552                                         (long long)dp->i_ino);
2553                                 for (i = 0; i < *nmaps; i++) {
2554                                         xfs_alert(mp,
2555 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2556                                                 i,
2557                                                 (long long)irecs[i].br_startoff,
2558                                                 (long long)irecs[i].br_startblock,
2559                                                 (long long)irecs[i].br_blockcount,
2560                                                 irecs[i].br_state);
2561                                 }
2562                         }
2563                         XFS_ERROR_REPORT("xfs_da_do_buf(1)",
2564                                          XFS_ERRLEVEL_LOW, mp);
2565                 }
2566                 goto out;
2567         }
2568         error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
2569 out:
2570         if (irecs != &irec)
2571                 kmem_free(irecs);
2572         return error;
2573 }
2574
2575 /*
2576  * Get a buffer for the dir/attr block.
2577  */
2578 int
2579 xfs_da_get_buf(
2580         struct xfs_trans        *trans,
2581         struct xfs_inode        *dp,
2582         xfs_dablk_t             bno,
2583         xfs_daddr_t             mappedbno,
2584         struct xfs_buf          **bpp,
2585         int                     whichfork)
2586 {
2587         struct xfs_buf          *bp;
2588         struct xfs_buf_map      map;
2589         struct xfs_buf_map      *mapp;
2590         int                     nmap;
2591         int                     error;
2592
2593         *bpp = NULL;
2594         mapp = &map;
2595         nmap = 1;
2596         error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
2597                                 &mapp, &nmap);
2598         if (error) {
2599                 /* mapping a hole is not an error, but we don't continue */
2600                 if (error == -1)
2601                         error = 0;
2602                 goto out_free;
2603         }
2604
2605         bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
2606                                     mapp, nmap, 0);
2607         error = bp ? bp->b_error : XFS_ERROR(EIO);
2608         if (error) {
2609                 xfs_trans_brelse(trans, bp);
2610                 goto out_free;
2611         }
2612
2613         *bpp = bp;
2614
2615 out_free:
2616         if (mapp != &map)
2617                 kmem_free(mapp);
2618
2619         return error;
2620 }
2621
2622 /*
2623  * Get a buffer for the dir/attr block, fill in the contents.
2624  */
int
xfs_da_read_buf(
	struct xfs_trans	*trans,
	struct xfs_inode	*dp,
	xfs_dablk_t		bno,
	xfs_daddr_t		mappedbno,	/* -1: map it; -2: hole is OK */
	struct xfs_buf		**bpp,
	int			whichfork,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	struct xfs_buf_map	map;
	struct xfs_buf_map	*mapp;
	int			nmap;
	int			error;

	/* Map the dablk to one or more daddr extents, then read them. */
	*bpp = NULL;
	mapp = &map;
	nmap = 1;
	error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
				&mapp, &nmap);
	if (error) {
		/* mapping a hole is not an error, but we don't continue */
		if (error == -1)
			error = 0;
		goto out_free;
	}

	error = xfs_trans_read_buf_map(dp->i_mount, trans,
					dp->i_mount->m_ddev_targp,
					mapp, nmap, 0, &bp, ops);
	if (error)
		goto out_free;

	/* Hint the buffer cache at the reclaim priority of this metadata. */
	if (whichfork == XFS_ATTR_FORK)
		xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
	else
		xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);

	/*
	 * This verification code will be moved to a CRC verification callback
	 * function so just leave it here unchanged until then.
	 */
	{
		/*
		 * The block could be any dir/attr format; overlay the three
		 * possible header layouts and check whichever magic applies.
		 * magic is the 16-bit da-blkinfo magic, magic1 the 32-bit
		 * dir2 data/block magic.
		 */
		xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
		xfs_dir2_free_t		*free = bp->b_addr;
		xfs_da_blkinfo_t	*info = bp->b_addr;
		uint			magic, magic1;
		struct xfs_mount	*mp = dp->i_mount;

		magic = be16_to_cpu(info->magic);
		magic1 = be32_to_cpu(hdr->magic);
		/* Reject the buffer unless one known magic value matches. */
		if (unlikely(
		    XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
				   (magic != XFS_DA3_NODE_MAGIC) &&
				   (magic != XFS_ATTR_LEAF_MAGIC) &&
				   (magic != XFS_ATTR3_LEAF_MAGIC) &&
				   (magic != XFS_DIR2_LEAF1_MAGIC) &&
				   (magic != XFS_DIR3_LEAF1_MAGIC) &&
				   (magic != XFS_DIR2_LEAFN_MAGIC) &&
				   (magic != XFS_DIR3_LEAFN_MAGIC) &&
				   (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
				   (magic1 != XFS_DIR3_BLOCK_MAGIC) &&
				   (magic1 != XFS_DIR2_DATA_MAGIC) &&
				   (magic1 != XFS_DIR3_DATA_MAGIC) &&
				   (free->hdr.magic !=
					cpu_to_be32(XFS_DIR2_FREE_MAGIC)) &&
				   (free->hdr.magic !=
					cpu_to_be32(XFS_DIR3_FREE_MAGIC)),
				mp, XFS_ERRTAG_DA_READ_BUF,
				XFS_RANDOM_DA_READ_BUF))) {
			trace_xfs_da_btree_corrupt(bp, _RET_IP_);
			XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
					     XFS_ERRLEVEL_LOW, mp, info);
			error = XFS_ERROR(EFSCORRUPTED);
			xfs_trans_brelse(trans, bp);
			goto out_free;
		}
	}
	*bpp = bp;
out_free:
	/* xfs_dabuf_map allocated a multi-extent map array; free it. */
	if (mapp != &map)
		kmem_free(mapp);

	return error;
}
2711
2712 /*
2713  * Readahead the dir/attr block.
2714  */
2715 xfs_daddr_t
2716 xfs_da_reada_buf(
2717         struct xfs_trans        *trans,
2718         struct xfs_inode        *dp,
2719         xfs_dablk_t             bno,
2720         xfs_daddr_t             mappedbno,
2721         int                     whichfork,
2722         const struct xfs_buf_ops *ops)
2723 {
2724         struct xfs_buf_map      map;
2725         struct xfs_buf_map      *mapp;
2726         int                     nmap;
2727         int                     error;
2728
2729         mapp = &map;
2730         nmap = 1;
2731         error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
2732                                 &mapp, &nmap);
2733         if (error) {
2734                 /* mapping a hole is not an error, but we don't continue */
2735                 if (error == -1)
2736                         error = 0;
2737                 goto out_free;
2738         }
2739
2740         mappedbno = mapp[0].bm_bn;
2741         xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
2742
2743 out_free:
2744         if (mapp != &map)
2745                 kmem_free(mapp);
2746
2747         if (error)
2748                 return -1;
2749         return mappedbno;
2750 }