]> git.karo-electronics.de Git - mv-sheeva.git/blob - fs/ocfs2/alloc.c
47a6ce84e67f31b552cc71572d45b5a457fa2c18
[mv-sheeva.git] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 #include <linux/quotaops.h>
32
33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
34 #include <cluster/masklog.h>
35
36 #include "ocfs2.h"
37
38 #include "alloc.h"
39 #include "aops.h"
40 #include "blockcheck.h"
41 #include "dlmglue.h"
42 #include "extent_map.h"
43 #include "inode.h"
44 #include "journal.h"
45 #include "localalloc.h"
46 #include "suballoc.h"
47 #include "sysfile.h"
48 #include "file.h"
49 #include "super.h"
50 #include "uptodate.h"
51 #include "xattr.h"
52 #include "refcounttree.h"
53
54 #include "buffer_head_io.h"
55
/*
 * How a record being inserted can coalesce with an existing extent
 * record: not at all, with its left neighbor, with its right neighbor,
 * or with both.
 */
enum ocfs2_contig_type {
        CONTIG_NONE = 0,
        CONTIG_LEFT,
        CONTIG_RIGHT,
        CONTIG_LEFTRIGHT,
};
62
63 static enum ocfs2_contig_type
64         ocfs2_extent_rec_contig(struct super_block *sb,
65                                 struct ocfs2_extent_rec *ext,
66                                 struct ocfs2_extent_rec *insert_rec);
67 /*
68  * Operations for a specific extent tree type.
69  *
70  * To implement an on-disk btree (extent tree) type in ocfs2, add
71  * an ocfs2_extent_tree_operations structure and the matching
72  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
73  * for the allocation portion of the extent tree.
74  */
struct ocfs2_extent_tree_operations {
        /*
         * last_eb_blk is the block number of the right most leaf extent
         * block.  Most on-disk structures containing an extent tree store
         * this value for fast access.  The ->eo_set_last_eb_blk() and
         * ->eo_get_last_eb_blk() operations access this value.  They are
         *  both required.
         */
        void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
                                   u64 blkno);
        u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);

        /*
         * The on-disk structure usually keeps track of how many total
         * clusters are stored in this extent tree.  This function updates
         * that value.  new_clusters is the delta, and must be
         * added to the total.  Required.
         */
        void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
                                   u32 new_clusters);

        /*
         * If this extent tree is supported by an extent map, insert
         * a record into the map.  Optional.
         */
        void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
                                     struct ocfs2_extent_rec *rec);

        /*
         * If this extent tree is supported by an extent map, truncate the
         * map to clusters.  Optional.
         */
        void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
                                       u32 clusters);

        /*
         * If ->eo_insert_check() exists, it is called before rec is
         * inserted into the extent tree.  It is optional.
         */
        int (*eo_insert_check)(struct ocfs2_extent_tree *et,
                               struct ocfs2_extent_rec *rec);
        /* Optional debugging hook: validate the root structure. */
        int (*eo_sanity_check)(struct ocfs2_extent_tree *et);

        /*
         * --------------------------------------------------------------
         * The remaining are internal to ocfs2_extent_tree and don't have
         * accessor functions
         */

        /*
         * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
         * It is required.
         */
        void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);

        /*
         * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
         * it exists.  If it does not, et->et_max_leaf_clusters is set
         * to 0 (unlimited).  Optional.
         */
        void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);

        /*
         * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
         * are contiguous or not. Optional. Don't need to set it if use
         * ocfs2_extent_rec as the tree leaf.
         */
        enum ocfs2_contig_type
                (*eo_extent_contig)(struct ocfs2_extent_tree *et,
                                    struct ocfs2_extent_rec *ext,
                                    struct ocfs2_extent_rec *insert_rec);
};
147
148
149 /*
150  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
151  * in the methods.
152  */
153 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
154 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
155                                          u64 blkno);
156 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
157                                          u32 clusters);
158 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
159                                            struct ocfs2_extent_rec *rec);
160 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
161                                              u32 clusters);
162 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
163                                      struct ocfs2_extent_rec *rec);
164 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
165 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
/* Extent tree rooted in a dinode's id2.i_list (regular file data). */
static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
        .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
        .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
        .eo_update_clusters     = ocfs2_dinode_update_clusters,
        .eo_extent_map_insert   = ocfs2_dinode_extent_map_insert,
        .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
        .eo_insert_check        = ocfs2_dinode_insert_check,
        .eo_sanity_check        = ocfs2_dinode_sanity_check,
        .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
};
176
/* Record the rightmost leaf extent block number in the dinode. */
static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                         u64 blkno)
{
        struct ocfs2_dinode *di = et->et_object;

        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
        di->i_last_eb_blk = cpu_to_le64(blkno);
}
185
/* Return the rightmost leaf extent block number stored in the dinode. */
static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
        struct ocfs2_dinode *di = et->et_object;

        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
        return le64_to_cpu(di->i_last_eb_blk);
}
193
/*
 * Add the cluster delta to the on-disk dinode count and mirror the new
 * total into the in-memory inode under ip_lock.
 */
static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
                                         u32 clusters)
{
        struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
        struct ocfs2_dinode *di = et->et_object;

        le32_add_cpu(&di->i_clusters, clusters);
        spin_lock(&oi->ip_lock);
        oi->ip_clusters = le32_to_cpu(di->i_clusters);
        spin_unlock(&oi->ip_lock);
}
205
/* Mirror a newly inserted record into the inode's extent map cache. */
static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
                                           struct ocfs2_extent_rec *rec)
{
        struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;

        ocfs2_extent_map_insert_rec(inode, rec);
}
213
/* Truncate the inode's extent map cache down to 'clusters'. */
static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
                                             u32 clusters)
{
        struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;

        ocfs2_extent_map_trunc(inode, clusters);
}
221
222 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
223                                      struct ocfs2_extent_rec *rec)
224 {
225         struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
226         struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
227
228         BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
229         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
230                         (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
231                         "Device %s, asking for sparse allocation: inode %llu, "
232                         "cpos %u, clusters %u\n",
233                         osb->dev_str,
234                         (unsigned long long)oi->ip_blkno,
235                         rec->e_cpos, oi->ip_clusters);
236
237         return 0;
238 }
239
/* Assert the tree really is dinode-rooted and the dinode is valid. */
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
{
        struct ocfs2_dinode *di = et->et_object;

        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
        BUG_ON(!OCFS2_IS_VALID_DINODE(di));

        return 0;
}
249
/* The root extent list of a dinode tree lives in id2.i_list. */
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
{
        struct ocfs2_dinode *di = et->et_object;

        et->et_root_el = &di->id2.i_list;
}
256
257
/* Root extent list of an xattr value tree lives in the value root. */
static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
{
        struct ocfs2_xattr_value_buf *vb = et->et_object;

        et->et_root_el = &vb->vb_xv->xr_list;
}
264
/* Record the rightmost leaf block in the xattr value root. */
static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                              u64 blkno)
{
        struct ocfs2_xattr_value_buf *vb = et->et_object;

        vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
}
272
/* Return the rightmost leaf block recorded in the xattr value root. */
static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
        struct ocfs2_xattr_value_buf *vb = et->et_object;

        return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
}
279
/* Apply the cluster delta to the xattr value root's total. */
static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
                                              u32 clusters)
{
        struct ocfs2_xattr_value_buf *vb = et->et_object;

        le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}
287
/* Extent tree rooted in an xattr value root; no extent map hooks. */
static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
        .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
        .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
        .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
        .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
};
294
/* Root extent list of an xattr index tree lives in the xattr block. */
static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
{
        struct ocfs2_xattr_block *xb = et->et_object;

        et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
}
301
/* Xattr tree leaves are capped at OCFS2_MAX_XATTR_TREE_LEAF_SIZE bytes. */
static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
{
        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
        et->et_max_leaf_clusters =
                ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
}
308
/* Record the rightmost leaf block in the xattr tree root. */
static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                             u64 blkno)
{
        struct ocfs2_xattr_block *xb = et->et_object;
        struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;

        xt->xt_last_eb_blk = cpu_to_le64(blkno);
}
317
/* Return the rightmost leaf block recorded in the xattr tree root. */
static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
        struct ocfs2_xattr_block *xb = et->et_object;
        struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;

        return le64_to_cpu(xt->xt_last_eb_blk);
}
325
/* Apply the cluster delta to the xattr tree root's total. */
static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
                                             u32 clusters)
{
        struct ocfs2_xattr_block *xb = et->et_object;

        le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
}
333
/* Extent tree rooted in an xattr block; leaves have a max size. */
static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
        .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
        .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
        .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
        .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
        .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
};
341
/* Record the rightmost leaf block in the indexed-dir root block. */
static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                          u64 blkno)
{
        struct ocfs2_dx_root_block *dx_root = et->et_object;

        dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
}
349
/* Return the rightmost leaf block recorded in the indexed-dir root. */
static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
        struct ocfs2_dx_root_block *dx_root = et->et_object;

        return le64_to_cpu(dx_root->dr_last_eb_blk);
}
356
/* Apply the cluster delta to the indexed-dir root's total. */
static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
                                          u32 clusters)
{
        struct ocfs2_dx_root_block *dx_root = et->et_object;

        le32_add_cpu(&dx_root->dr_clusters, clusters);
}
364
/* Assert the object really is a valid dx_root block. */
static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
{
        struct ocfs2_dx_root_block *dx_root = et->et_object;

        BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));

        return 0;
}
373
/* Root extent list of an indexed-dir tree lives in dr_list. */
static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
{
        struct ocfs2_dx_root_block *dx_root = et->et_object;

        et->et_root_el = &dx_root->dr_list;
}
380
/* Extent tree rooted in a directory index root block. */
static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
        .eo_set_last_eb_blk     = ocfs2_dx_root_set_last_eb_blk,
        .eo_get_last_eb_blk     = ocfs2_dx_root_get_last_eb_blk,
        .eo_update_clusters     = ocfs2_dx_root_update_clusters,
        .eo_sanity_check        = ocfs2_dx_root_sanity_check,
        .eo_fill_root_el        = ocfs2_dx_root_fill_root_el,
};
388
/* Root extent list of a refcount tree lives in rf_list. */
static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
{
        struct ocfs2_refcount_block *rb = et->et_object;

        et->et_root_el = &rb->rf_list;
}
395
/* Record the rightmost leaf block in the refcount root block. */
static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                                u64 blkno)
{
        struct ocfs2_refcount_block *rb = et->et_object;

        rb->rf_last_eb_blk = cpu_to_le64(blkno);
}
403
/* Return the rightmost leaf block recorded in the refcount root. */
static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
        struct ocfs2_refcount_block *rb = et->et_object;

        return le64_to_cpu(rb->rf_last_eb_blk);
}
410
/* Apply the cluster delta to the refcount root's total. */
static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
                                                u32 clusters)
{
        struct ocfs2_refcount_block *rb = et->et_object;

        le32_add_cpu(&rb->rf_clusters, clusters);
}
418
/*
 * Refcount tree records are never merged with their neighbors, so this
 * override unconditionally reports no contiguousness.
 */
static enum ocfs2_contig_type
ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
                                  struct ocfs2_extent_rec *ext,
                                  struct ocfs2_extent_rec *insert_rec)
{
        return CONTIG_NONE;
}
426
/* Extent tree rooted in a refcount block; merging is disabled. */
static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
        .eo_set_last_eb_blk     = ocfs2_refcount_tree_set_last_eb_blk,
        .eo_get_last_eb_blk     = ocfs2_refcount_tree_get_last_eb_blk,
        .eo_update_clusters     = ocfs2_refcount_tree_update_clusters,
        .eo_fill_root_el        = ocfs2_refcount_tree_fill_root_el,
        .eo_extent_contig       = ocfs2_refcount_tree_extent_contig,
};
434
435 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
436                                      struct ocfs2_caching_info *ci,
437                                      struct buffer_head *bh,
438                                      ocfs2_journal_access_func access,
439                                      void *obj,
440                                      struct ocfs2_extent_tree_operations *ops)
441 {
442         et->et_ops = ops;
443         et->et_root_bh = bh;
444         et->et_ci = ci;
445         et->et_root_journal_access = access;
446         if (!obj)
447                 obj = (void *)bh->b_data;
448         et->et_object = obj;
449
450         et->et_ops->eo_fill_root_el(et);
451         if (!et->et_ops->eo_fill_max_leaf_clusters)
452                 et->et_max_leaf_clusters = 0;
453         else
454                 et->et_ops->eo_fill_max_leaf_clusters(et);
455 }
456
/* Initialize an extent tree handle for a dinode-rooted tree. */
void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
                                   struct ocfs2_caching_info *ci,
                                   struct buffer_head *bh)
{
        __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
                                 NULL, &ocfs2_dinode_et_ops);
}
464
/* Initialize an extent tree handle for an xattr-block-rooted tree. */
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
                                       struct ocfs2_caching_info *ci,
                                       struct buffer_head *bh)
{
        __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
                                 NULL, &ocfs2_xattr_tree_et_ops);
}
472
/*
 * Initialize an extent tree handle for an xattr value tree.  The value
 * buffer carries its own root bh and journal access function.
 */
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                        struct ocfs2_caching_info *ci,
                                        struct ocfs2_xattr_value_buf *vb)
{
        __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
                                 &ocfs2_xattr_value_et_ops);
}
480
/* Initialize an extent tree handle for a directory index root tree. */
void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
                                    struct ocfs2_caching_info *ci,
                                    struct buffer_head *bh)
{
        __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
                                 NULL, &ocfs2_dx_root_et_ops);
}
488
/* Initialize an extent tree handle for a refcount-block-rooted tree. */
void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
                                     struct ocfs2_caching_info *ci,
                                     struct buffer_head *bh)
{
        __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
                                 NULL, &ocfs2_refcount_tree_et_ops);
}
496
/* Dispatch to the tree type's required last_eb_blk setter. */
static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                            u64 new_last_eb_blk)
{
        et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
}
502
/* Dispatch to the tree type's required last_eb_blk getter. */
static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
        return et->et_ops->eo_get_last_eb_blk(et);
}
507
/* Dispatch the cluster-count delta to the tree type's updater. */
static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
                                            u32 clusters)
{
        et->et_ops->eo_update_clusters(et, clusters);
}
513
/* Mirror an insert into the extent map, if this tree type has one. */
static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
                                              struct ocfs2_extent_rec *rec)
{
        if (et->et_ops->eo_extent_map_insert)
                et->et_ops->eo_extent_map_insert(et, rec);
}
520
/* Truncate the extent map, if this tree type has one. */
static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
                                                u32 clusters)
{
        if (et->et_ops->eo_extent_map_truncate)
                et->et_ops->eo_extent_map_truncate(et, clusters);
}
527
/* Journal the tree's root buffer with its type-specific access func. */
static inline int ocfs2_et_root_journal_access(handle_t *handle,
                                               struct ocfs2_extent_tree *et,
                                               int type)
{
        return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
                                          type);
}
535
/*
 * Ask whether rec and insert_rec can coalesce.  Tree types with a
 * custom leaf format supply eo_extent_contig; all others use the
 * generic ocfs2_extent_rec comparison.
 */
static inline enum ocfs2_contig_type
        ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
                               struct ocfs2_extent_rec *rec,
                               struct ocfs2_extent_rec *insert_rec)
{
        if (et->et_ops->eo_extent_contig)
                return et->et_ops->eo_extent_contig(et, rec, insert_rec);

        return ocfs2_extent_rec_contig(
                                ocfs2_metadata_cache_get_super(et->et_ci),
                                rec, insert_rec);
}
548
549 static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
550                                         struct ocfs2_extent_rec *rec)
551 {
552         int ret = 0;
553
554         if (et->et_ops->eo_insert_check)
555                 ret = et->et_ops->eo_insert_check(et, rec);
556         return ret;
557 }
558
559 static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
560 {
561         int ret = 0;
562
563         if (et->et_ops->eo_sanity_check)
564                 ret = et->et_ops->eo_sanity_check(et);
565         return ret;
566 }
567
568 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
569                                          struct ocfs2_extent_block *eb);
570 static void ocfs2_adjust_rightmost_records(handle_t *handle,
571                                            struct ocfs2_extent_tree *et,
572                                            struct ocfs2_path *path,
573                                            struct ocfs2_extent_rec *insert_rec);
/*
 * Reset the actual path elements so that we can re-use the structure
 * to build another path. Generally, this involves freeing the buffer
 * heads.
 */
void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
        int i, start = 0, depth = 0;
        struct ocfs2_path_item *node;

        /* Item 0 is the root; skip it when the caller wants it kept. */
        if (keep_root)
                start = 1;

        for(i = start; i < path_num_items(path); i++) {
                node = &path->p_node[i];

                brelse(node->bh);
                node->bh = NULL;
                node->el = NULL;
        }

        /*
         * Tree depth may change during truncate, or insert. If we're
         * keeping the root extent list, then make sure that our path
         * structure reflects the proper depth.
         */
        if (keep_root)
                depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
        else
                path_root_access(path) = NULL;

        path->p_tree_depth = depth;
}
607
/* Release every buffer held by @path (including the root) and free it. */
void ocfs2_free_path(struct ocfs2_path *path)
{
        if (!path)
                return;

        ocfs2_reinit_path(path, 0);
        kfree(path);
}
615
/*
 * Copy all the elements of src into dest. After this call, src could
 * be freed without affecting dest.
 *
 * Both paths should have the same root. Any non-root elements of dest
 * will be freed.
 */
static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
        int i;

        BUG_ON(path_root_bh(dest) != path_root_bh(src));
        BUG_ON(path_root_el(dest) != path_root_el(src));
        BUG_ON(path_root_access(dest) != path_root_access(src));

        /* Drop dest's old non-root buffers before overwriting them. */
        ocfs2_reinit_path(dest, 1);

        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
                dest->p_node[i].bh = src->p_node[i].bh;
                dest->p_node[i].el = src->p_node[i].el;

                /* dest holds its own reference on each shared buffer. */
                if (dest->p_node[i].bh)
                        get_bh(dest->p_node[i].bh);
        }
}
641
/*
 * Make the *dest path the same as src and re-initialize src path to
 * have a root only.  Buffer references move from src to dest, so no
 * get_bh() is needed.
 */
static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
        int i;

        BUG_ON(path_root_bh(dest) != path_root_bh(src));
        BUG_ON(path_root_access(dest) != path_root_access(src));

        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
                /* Drop dest's old buffer before taking over src's. */
                brelse(dest->p_node[i].bh);

                dest->p_node[i].bh = src->p_node[i].bh;
                dest->p_node[i].el = src->p_node[i].el;

                src->p_node[i].bh = NULL;
                src->p_node[i].el = NULL;
        }
}
663
/*
 * Insert an extent block at given index.
 *
 * This will not take an additional reference on eb_bh.
 */
static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
                                        struct buffer_head *eb_bh)
{
        struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;

        /*
         * Right now, no root bh is an extent block, so this helps
         * catch code errors with dinode trees. The assertion can be
         * safely removed if we ever need to insert extent block
         * structures at the root.
         */
        BUG_ON(index == 0);

        path->p_node[index].bh = eb_bh;
        path->p_node[index].el = &eb->h_list;
}
685
/*
 * Allocate a path whose root is @root_bh/@root_el.  Takes a reference
 * on root_bh.  Returns NULL on allocation failure.
 */
static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
                                         struct ocfs2_extent_list *root_el,
                                         ocfs2_journal_access_func access)
{
        struct ocfs2_path *path;

        BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);

        path = kzalloc(sizeof(*path), GFP_NOFS);
        if (path) {
                path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
                get_bh(root_bh);
                path_root_bh(path) = root_bh;
                path_root_el(path) = root_el;
                path_root_access(path) = access;
        }

        return path;
}
705
/* Allocate a fresh path sharing @path's root. */
struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
        return ocfs2_new_path(path_root_bh(path), path_root_el(path),
                              path_root_access(path));
}
711
/* Allocate a path rooted at an extent tree's root buffer. */
struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
{
        return ocfs2_new_path(et->et_root_bh, et->et_root_el,
                              et->et_root_journal_access);
}
717
/*
 * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
 * otherwise it's the root_access function.
 *
 * I don't like the way this function's name looks next to
 * ocfs2_journal_access_path(), but I don't have a better one.
 */
int ocfs2_path_bh_journal_access(handle_t *handle,
                                 struct ocfs2_caching_info *ci,
                                 struct ocfs2_path *path,
                                 int idx)
{
        ocfs2_journal_access_func access = path_root_access(path);

        /* Paths without a recorded root accessor fall back to generic. */
        if (!access)
                access = ocfs2_journal_access;

        if (idx)
                access = ocfs2_journal_access_eb;

        return access(handle, ci, path->p_node[idx].bh,
                      OCFS2_JOURNAL_ACCESS_WRITE);
}
741
742 /*
743  * Convenience function to journal all components in a path.
744  */
745 int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
746                               handle_t *handle,
747                               struct ocfs2_path *path)
748 {
749         int i, ret = 0;
750
751         if (!path)
752                 goto out;
753
754         for(i = 0; i < path_num_items(path); i++) {
755                 ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
756                 if (ret < 0) {
757                         mlog_errno(ret);
758                         goto out;
759                 }
760         }
761
762 out:
763         return ret;
764 }
765
766 /*
767  * Return the index of the extent record which contains cluster #v_cluster.
768  * -1 is returned if it was not found.
769  *
770  * Should work fine on interior and exterior nodes.
771  */
772 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
773 {
774         int ret = -1;
775         int i;
776         struct ocfs2_extent_rec *rec;
777         u32 rec_end, rec_start, clusters;
778
779         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
780                 rec = &el->l_recs[i];
781
782                 rec_start = le32_to_cpu(rec->e_cpos);
783                 clusters = ocfs2_rec_clusters(el, rec);
784
785                 rec_end = rec_start + clusters;
786
787                 if (v_cluster >= rec_start && v_cluster < rec_end) {
788                         ret = i;
789                         break;
790                 }
791         }
792
793         return ret;
794 }
795
/*
 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
 * ocfs2_extent_rec_contig only work properly against leaf nodes!
 */
/* Does @blkno start exactly one block past the end of @ext on disk? */
static int ocfs2_block_extent_contig(struct super_block *sb,
                                     struct ocfs2_extent_rec *ext,
                                     u64 blkno)
{
        u64 blk_end = le64_to_cpu(ext->e_blkno);

        blk_end += ocfs2_clusters_to_blocks(sb,
                                    le16_to_cpu(ext->e_leaf_clusters));

        return blkno == blk_end;
}
811
/* Does @right begin at the cluster immediately after @left ends? */
static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
                                  struct ocfs2_extent_rec *right)
{
        u32 left_range;

        left_range = le32_to_cpu(left->e_cpos) +
                le16_to_cpu(left->e_leaf_clusters);

        return (left_range == le32_to_cpu(right->e_cpos));
}
822
/*
 * Generic contiguousness test: can @insert_rec merge with @ext?
 * Requires matching flags and both logical (cpos) and physical (blkno)
 * adjacency on the same side.  Leaf records only.
 */
static enum ocfs2_contig_type
        ocfs2_extent_rec_contig(struct super_block *sb,
                                struct ocfs2_extent_rec *ext,
                                struct ocfs2_extent_rec *insert_rec)
{
        u64 blkno = le64_to_cpu(insert_rec->e_blkno);

        /*
         * Refuse to coalesce extent records with different flag
         * fields - we don't want to mix unwritten extents with user
         * data.
         */
        if (ext->e_flags != insert_rec->e_flags)
                return CONTIG_NONE;

        if (ocfs2_extents_adjacent(ext, insert_rec) &&
            ocfs2_block_extent_contig(sb, ext, blkno))
                        return CONTIG_RIGHT;

        blkno = le64_to_cpu(ext->e_blkno);
        if (ocfs2_extents_adjacent(insert_rec, ext) &&
            ocfs2_block_extent_contig(sb, insert_rec, blkno))
                return CONTIG_LEFT;

        return CONTIG_NONE;
}
849
850 /*
851  * NOTE: We can have pretty much any combination of contiguousness and
852  * appending.
853  *
854  * The usefulness of APPEND_TAIL is more in that it lets us know that
855  * we'll have to update the path to that leaf.
856  */
enum ocfs2_append_type {
	APPEND_NONE = 0,	/* insert lands somewhere inside the leaf */
	APPEND_TAIL,		/* insert goes past the current rightmost rec */
};
861
/*
 * Which portion of an existing record a "split" insert replaces.
 * NOTE(review): full split semantics live in code outside this view -
 * confirm against the merge/split helpers before relying on this.
 */
enum ocfs2_split_type {
	SPLIT_NONE = 0,
	SPLIT_LEFT,
	SPLIT_RIGHT,
};
867
/*
 * Describes how a new extent record should be inserted into the tree.
 * Filled in by the insert-analysis code and consumed by the actual
 * insert path.
 */
struct ocfs2_insert_type {
	enum ocfs2_split_type	ins_split;	/* SPLIT_NONE for plain inserts */
	enum ocfs2_append_type	ins_appending;	/* does this extend the tail? */
	enum ocfs2_contig_type	ins_contig;	/* coalescable with a neighbor? */
	int			ins_contig_index; /* presumably the index of the
						   * contiguous record - verify
						   * against the insert code */
	int			ins_tree_depth;	/* tree depth at insert time */
};
875
/*
 * Context describing how a split record may merge with its neighbors.
 */
struct ocfs2_merge_ctxt {
	enum ocfs2_contig_type	c_contig_type;	 /* merge left, right, both? */
	int			c_has_empty_extent; /* leaf has an empty rec 0 */
	int			c_split_covers_rec; /* split spans the whole rec */
};
881
/*
 * Validation callback for extent blocks read from disk (passed to
 * ocfs2_read_block() by ocfs2_read_extent_block()).
 *
 * Returns 0 if the block looks sane, the ecc error code if the
 * checksum fails, or -EINVAL (after reporting via ocfs2_error())
 * for structural corruption.
 */
static int ocfs2_validate_extent_block(struct super_block *sb,
                                       struct buffer_head *bh)
{
        int rc;
        struct ocfs2_extent_block *eb =
                (struct ocfs2_extent_block *)bh->b_data;

        mlog(0, "Validating extent block %llu\n",
             (unsigned long long)bh->b_blocknr);

        BUG_ON(!buffer_uptodate(bh));

        /*
         * If the ecc fails, we return the error but otherwise
         * leave the filesystem running.  We know any error is
         * local to this block.
         */
        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
        if (rc) {
                mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
                     (unsigned long long)bh->b_blocknr);
                return rc;
        }

        /*
         * Errors after here are fatal.
         */

        /* Bad signature: this isn't an extent block at all. */
        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                ocfs2_error(sb,
                            "Extent block #%llu has bad signature %.*s",
                            (unsigned long long)bh->b_blocknr, 7,
                            eb->h_signature);
                return -EINVAL;
        }

        /* The block's self-recorded location must match where we read it. */
        if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
                ocfs2_error(sb,
                            "Extent block #%llu has an invalid h_blkno "
                            "of %llu",
                            (unsigned long long)bh->b_blocknr,
                            (unsigned long long)le64_to_cpu(eb->h_blkno));
                return -EINVAL;
        }

        /* Generation must match the mounted filesystem's generation. */
        if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
                ocfs2_error(sb,
                            "Extent block #%llu has an invalid "
                            "h_fs_generation of #%u",
                            (unsigned long long)bh->b_blocknr,
                            le32_to_cpu(eb->h_fs_generation));
                return -EINVAL;
        }

        return 0;
}
938
939 int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
940                             struct buffer_head **bh)
941 {
942         int rc;
943         struct buffer_head *tmp = *bh;
944
945         rc = ocfs2_read_block(ci, eb_blkno, &tmp,
946                               ocfs2_validate_extent_block);
947
948         /* If ocfs2_read_block() got us a new bh, pass it up. */
949         if (!rc && !*bh)
950                 *bh = tmp;
951
952         return rc;
953 }
954
955
956 /*
957  * How many free extents have we got before we need more meta data?
958  */
959 int ocfs2_num_free_extents(struct ocfs2_super *osb,
960                            struct ocfs2_extent_tree *et)
961 {
962         int retval;
963         struct ocfs2_extent_list *el = NULL;
964         struct ocfs2_extent_block *eb;
965         struct buffer_head *eb_bh = NULL;
966         u64 last_eb_blk = 0;
967
968         el = et->et_root_el;
969         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
970
971         if (last_eb_blk) {
972                 retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
973                                                  &eb_bh);
974                 if (retval < 0) {
975                         mlog_errno(retval);
976                         goto bail;
977                 }
978                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
979                 el = &eb->h_list;
980         }
981
982         BUG_ON(el->l_tree_depth != 0);
983
984         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
985 bail:
986         brelse(eb_bh);
987
988         mlog_exit(retval);
989         return retval;
990 }
991
992 /* expects array to already be allocated
993  *
994  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
995  * l_count for you
996  */
997 static int ocfs2_create_new_meta_bhs(handle_t *handle,
998                                      struct ocfs2_extent_tree *et,
999                                      int wanted,
1000                                      struct ocfs2_alloc_context *meta_ac,
1001                                      struct buffer_head *bhs[])
1002 {
1003         int count, status, i;
1004         u16 suballoc_bit_start;
1005         u32 num_got;
1006         u64 suballoc_loc, first_blkno;
1007         struct ocfs2_super *osb =
1008                 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1009         struct ocfs2_extent_block *eb;
1010
1011         count = 0;
1012         while (count < wanted) {
1013                 status = ocfs2_claim_metadata(handle,
1014                                               meta_ac,
1015                                               wanted - count,
1016                                               &suballoc_loc,
1017                                               &suballoc_bit_start,
1018                                               &num_got,
1019                                               &first_blkno);
1020                 if (status < 0) {
1021                         mlog_errno(status);
1022                         goto bail;
1023                 }
1024
1025                 for(i = count;  i < (num_got + count); i++) {
1026                         bhs[i] = sb_getblk(osb->sb, first_blkno);
1027                         if (bhs[i] == NULL) {
1028                                 status = -EIO;
1029                                 mlog_errno(status);
1030                                 goto bail;
1031                         }
1032                         ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
1033
1034                         status = ocfs2_journal_access_eb(handle, et->et_ci,
1035                                                          bhs[i],
1036                                                          OCFS2_JOURNAL_ACCESS_CREATE);
1037                         if (status < 0) {
1038                                 mlog_errno(status);
1039                                 goto bail;
1040                         }
1041
1042                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
1043                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
1044                         /* Ok, setup the minimal stuff here. */
1045                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1046                         eb->h_blkno = cpu_to_le64(first_blkno);
1047                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1048                         eb->h_suballoc_slot =
1049                                 cpu_to_le16(meta_ac->ac_alloc_slot);
1050                         eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1051                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1052                         eb->h_list.l_count =
1053                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
1054
1055                         suballoc_bit_start++;
1056                         first_blkno++;
1057
1058                         /* We'll also be dirtied by the caller, so
1059                          * this isn't absolutely necessary. */
1060                         ocfs2_journal_dirty(handle, bhs[i]);
1061                 }
1062
1063                 count += num_got;
1064         }
1065
1066         status = 0;
1067 bail:
1068         if (status < 0) {
1069                 for(i = 0; i < wanted; i++) {
1070                         brelse(bhs[i]);
1071                         bhs[i] = NULL;
1072                 }
1073         }
1074         mlog_exit(status);
1075         return status;
1076 }
1077
1078 /*
1079  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
1080  *
1081  * Returns the sum of the rightmost extent rec logical offset and
1082  * cluster count.
1083  *
1084  * ocfs2_add_branch() uses this to determine what logical cluster
1085  * value should be populated into the leftmost new branch records.
1086  *
1087  * ocfs2_shift_tree_depth() uses this to determine the # clusters
1088  * value for the new topmost tree record.
1089  */
1090 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
1091 {
1092         int i;
1093
1094         i = le16_to_cpu(el->l_next_free_rec) - 1;
1095
1096         return le32_to_cpu(el->l_recs[i].e_cpos) +
1097                 ocfs2_rec_clusters(el, &el->l_recs[i]);
1098 }
1099
1100 /*
1101  * Change range of the branches in the right most path according to the leaf
1102  * extent block's rightmost record.
1103  */
1104 static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1105                                          struct ocfs2_extent_tree *et)
1106 {
1107         int status;
1108         struct ocfs2_path *path = NULL;
1109         struct ocfs2_extent_list *el;
1110         struct ocfs2_extent_rec *rec;
1111
1112         path = ocfs2_new_path_from_et(et);
1113         if (!path) {
1114                 status = -ENOMEM;
1115                 return status;
1116         }
1117
1118         status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1119         if (status < 0) {
1120                 mlog_errno(status);
1121                 goto out;
1122         }
1123
1124         status = ocfs2_extend_trans(handle, path_num_items(path));
1125         if (status < 0) {
1126                 mlog_errno(status);
1127                 goto out;
1128         }
1129
1130         status = ocfs2_journal_access_path(et->et_ci, handle, path);
1131         if (status < 0) {
1132                 mlog_errno(status);
1133                 goto out;
1134         }
1135
1136         el = path_leaf_el(path);
1137         rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
1138
1139         ocfs2_adjust_rightmost_records(handle, et, path, rec);
1140
1141 out:
1142         ocfs2_free_path(path);
1143         return status;
1144 }
1145
1146 /*
1147  * Add an entire tree branch to our inode. eb_bh is the extent block
1148  * to start at, if we don't want to start the branch at the root
1149  * structure.
1150  *
1151  * last_eb_bh is required as we have to update it's next_leaf pointer
1152  * for the new last extent block.
1153  *
1154  * the new branch will be 'empty' in the sense that every block will
1155  * contain a single record with cluster count == 0.
1156  */
static int ocfs2_add_branch(handle_t *handle,
                            struct ocfs2_extent_tree *et,
                            struct buffer_head *eb_bh,
                            struct buffer_head **last_eb_bh,
                            struct ocfs2_alloc_context *meta_ac)
{
        int status, new_blocks, i;
        u64 next_blkno, new_last_eb_blk;
        struct buffer_head *bh;
        struct buffer_head **new_eb_bhs = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *eb_el;
        struct ocfs2_extent_list  *el;
        u32 new_cpos, root_end;

        BUG_ON(!last_eb_bh || !*last_eb_bh);

        /* el is the list we will link the new branch into: either the
         * passed-in extent block's list, or the root list. */
        if (eb_bh) {
                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
                el = &eb->h_list;
        } else
                el = et->et_root_el;

        /* we never add a branch to a leaf. */
        BUG_ON(!el->l_tree_depth);

        /* One new block per level below the attach point. */
        new_blocks = le16_to_cpu(el->l_tree_depth);

        eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
        new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
        root_end = ocfs2_sum_rightmost_rec(et->et_root_el);

        /*
         * If there is a gap before the root end and the real end
         * of the righmost leaf block, we need to remove the gap
         * between new_cpos and root_end first so that the tree
         * is consistent after we add a new branch(it will start
         * from new_cpos).
         */
        if (root_end > new_cpos) {
                mlog(0, "adjust the cluster end from %u to %u\n",
                     root_end, new_cpos);
                status = ocfs2_adjust_rightmost_branch(handle, et);
                if (status) {
                        mlog_errno(status);
                        goto bail;
                }
        }

        /* allocate the number of new eb blocks we need */
        new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
                             GFP_KERNEL);
        if (!new_eb_bhs) {
                status = -ENOMEM;
                mlog_errno(status);
                goto bail;
        }

        status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
                                           meta_ac, new_eb_bhs);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
         * linked with the rest of the tree.
         * conversly, new_eb_bhs[0] is the new bottommost leaf.
         *
         * when we leave the loop, new_last_eb_blk will point to the
         * newest leaf, and next_blkno will point to the topmost extent
         * block. */
        next_blkno = new_last_eb_blk = 0;
        for(i = 0; i < new_blocks; i++) {
                bh = new_eb_bhs[i];
                eb = (struct ocfs2_extent_block *) bh->b_data;
                /* ocfs2_create_new_meta_bhs() should create it right! */
                BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
                eb_el = &eb->h_list;

                status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
                                                 OCFS2_JOURNAL_ACCESS_CREATE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                eb->h_next_leaf_blk = 0;
                eb_el->l_tree_depth = cpu_to_le16(i);
                eb_el->l_next_free_rec = cpu_to_le16(1);
                /*
                 * This actually counts as an empty extent as
                 * c_clusters == 0
                 */
                eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
                /* Point at the previously-initialized (lower) level;
                 * zero for the leaf. */
                eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
                /*
                 * eb_el isn't always an interior node, but even leaf
                 * nodes want a zero'd flags and reserved field so
                 * this gets the whole 32 bits regardless of use.
                 */
                eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
                if (!eb_el->l_tree_depth)
                        new_last_eb_blk = le64_to_cpu(eb->h_blkno);

                ocfs2_journal_dirty(handle, bh);
                next_blkno = le64_to_cpu(eb->h_blkno);
        }

        /* This is a bit hairy. We want to update up to three blocks
         * here without leaving any of them in an inconsistent state
         * in case of error. We don't have to worry about
         * journal_dirty erroring as it won't unless we've aborted the
         * handle (in which case we would never be here) so reserving
         * the write with journal_access is all we need to do. */
        status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        status = ocfs2_et_root_journal_access(handle, et,
                                              OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        if (eb_bh) {
                status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
        }

        /* Link the new branch into the rest of the tree (el will
         * either be on the root_bh, or the extent block passed in. */
        i = le16_to_cpu(el->l_next_free_rec);
        el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
        el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
        el->l_recs[i].e_int_clusters = 0;
        le16_add_cpu(&el->l_next_free_rec, 1);

        /* fe needs a new last extent block pointer, as does the
         * next_leaf on the previously last-extent-block. */
        ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);

        eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
        eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);

        ocfs2_journal_dirty(handle, *last_eb_bh);
        ocfs2_journal_dirty(handle, et->et_root_bh);
        if (eb_bh)
                ocfs2_journal_dirty(handle, eb_bh);

        /*
         * Some callers want to track the rightmost leaf so pass it
         * back here.
         */
        brelse(*last_eb_bh);
        get_bh(new_eb_bhs[0]);
        *last_eb_bh = new_eb_bhs[0];

        status = 0;
bail:
        if (new_eb_bhs) {
                for (i = 0; i < new_blocks; i++)
                        brelse(new_eb_bhs[i]);
                kfree(new_eb_bhs);
        }

        mlog_exit(status);
        return status;
}
1332
1333 /*
1334  * adds another level to the allocation tree.
1335  * returns back the new extent block so you can add a branch to it
1336  * after this call.
1337  */
static int ocfs2_shift_tree_depth(handle_t *handle,
                                  struct ocfs2_extent_tree *et,
                                  struct ocfs2_alloc_context *meta_ac,
                                  struct buffer_head **ret_new_eb_bh)
{
        int status, i;
        u32 new_clusters;
        struct buffer_head *new_eb_bh = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *root_el;
        struct ocfs2_extent_list  *eb_el;

        /* One new extent block to take over the current root's records. */
        status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
                                           &new_eb_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
        /* ocfs2_create_new_meta_bhs() should create it right! */
        BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));

        eb_el = &eb->h_list;
        root_el = et->et_root_el;

        status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        /* copy the root extent list data into the new extent block */
        eb_el->l_tree_depth = root_el->l_tree_depth;
        eb_el->l_next_free_rec = root_el->l_next_free_rec;
        for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
                eb_el->l_recs[i] = root_el->l_recs[i];

        ocfs2_journal_dirty(handle, new_eb_bh);

        status = ocfs2_et_root_journal_access(handle, et,
                                              OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        new_clusters = ocfs2_sum_rightmost_rec(eb_el);

        /* update root_bh now */
        /* The root becomes a single interior record pointing at the
         * new block, one level deeper than before. */
        le16_add_cpu(&root_el->l_tree_depth, 1);
        root_el->l_recs[0].e_cpos = 0;
        root_el->l_recs[0].e_blkno = eb->h_blkno;
        root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
        for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
                memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
        root_el->l_next_free_rec = cpu_to_le16(1);

        /* If this is our 1st tree depth shift, then last_eb_blk
         * becomes the allocated extent block */
        if (root_el->l_tree_depth == cpu_to_le16(1))
                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

        ocfs2_journal_dirty(handle, et->et_root_bh);

        /* Hand the new block (with a reference) back to the caller. */
        *ret_new_eb_bh = new_eb_bh;
        new_eb_bh = NULL;
        status = 0;
bail:
        brelse(new_eb_bh);

        mlog_exit(status);
        return status;
}
1413
1414 /*
1415  * Should only be called when there is no space left in any of the
1416  * leaf nodes. What we want to do is find the lowest tree depth
1417  * non-leaf extent block with room for new records. There are three
1418  * valid results of this search:
1419  *
1420  * 1) a lowest extent block is found, then we pass it back in
1421  *    *lowest_eb_bh and return '0'
1422  *
1423  * 2) the search fails to find anything, but the root_el has room. We
1424  *    pass NULL back in *lowest_eb_bh, but still return '0'
1425  *
1426  * 3) the search fails to find anything AND the root_el is full, in
1427  *    which case we return > 0
1428  *
1429  * return status < 0 indicates an error.
1430  */
static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
                                    struct buffer_head **target_bh)
{
        int status = 0, i;
        u64 blkno;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *el;
        struct buffer_head *bh = NULL;
        struct buffer_head *lowest_bh = NULL;

        *target_bh = NULL;

        el = et->et_root_el;

        /* Walk the rightmost interior path; stop above the leaves. */
        while(le16_to_cpu(el->l_tree_depth) > 1) {
                if (le16_to_cpu(el->l_next_free_rec) == 0) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
                                    "Owner %llu has empty "
                                    "extent list (next_free_rec == 0)",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
                        status = -EIO;
                        goto bail;
                }
                i = le16_to_cpu(el->l_next_free_rec) - 1;
                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
                if (!blkno) {
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
                                    "Owner %llu has extent "
                                    "list where extent # %d has no physical "
                                    "block start",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
                        status = -EIO;
                        goto bail;
                }

                /* Drop the previous level's buffer before descending. */
                brelse(bh);
                bh = NULL;

                status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                eb = (struct ocfs2_extent_block *) bh->b_data;
                el = &eb->h_list;

                /* Deeper blocks with room override shallower ones, so
                 * we end up with the lowest block that has space. */
                if (le16_to_cpu(el->l_next_free_rec) <
                    le16_to_cpu(el->l_count)) {
                        brelse(lowest_bh);
                        lowest_bh = bh;
                        get_bh(lowest_bh);
                }
        }

        /* If we didn't find one and the fe doesn't have any room,
         * then return '1' */
        /* (Raw __le16 equality compare is endian-safe here.) */
        el = et->et_root_el;
        if (!lowest_bh && (el->l_next_free_rec == el->l_count))
                status = 1;

        *target_bh = lowest_bh;
bail:
        brelse(bh);

        mlog_exit(status);
        return status;
}
1499
1500 /*
1501  * Grow a b-tree so that it has more records.
1502  *
1503  * We might shift the tree depth in which case existing paths should
1504  * be considered invalid.
1505  *
1506  * Tree depth after the grow is returned via *final_depth.
1507  *
1508  * *last_eb_bh will be updated by ocfs2_add_branch().
1509  */
static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
                           int *final_depth, struct buffer_head **last_eb_bh,
                           struct ocfs2_alloc_context *meta_ac)
{
        int ret, shift;
        struct ocfs2_extent_list *el = et->et_root_el;
        int depth = le16_to_cpu(el->l_tree_depth);
        struct buffer_head *bh = NULL;

        BUG_ON(meta_ac == NULL);

        /* shift > 0 means no interior block has room and neither does
         * the root: we must add a level.  shift == 0 with bh set means
         * we found an interior block to branch from. */
        shift = ocfs2_find_branch_target(et, &bh);
        if (shift < 0) {
                ret = shift;
                mlog_errno(ret);
                goto out;
        }

        /* We traveled all the way to the bottom of the allocation tree
         * and didn't find room for any more extents - we need to add
         * another tree level */
        if (shift) {
                BUG_ON(bh);
                mlog(0, "need to shift tree depth (current = %d)\n", depth);

                /* ocfs2_shift_tree_depth will return us a buffer with
                 * the new extent block (so we can pass that to
                 * ocfs2_add_branch). */
                ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }
                depth++;
                if (depth == 1) {
                        /*
                         * Special case: we have room now if we shifted from
                         * tree_depth 0, so no more work needs to be done.
                         *
                         * We won't be calling add_branch, so pass
                         * back *last_eb_bh as the new leaf. At depth
                         * zero, it should always be null so there's
                         * no reason to brelse.
                         */
                        BUG_ON(*last_eb_bh);
                        get_bh(bh);
                        *last_eb_bh = bh;
                        goto out;
                }
        }

        /* call ocfs2_add_branch to add the final part of the tree with
         * the new data. */
        mlog(0, "add branch. bh = %p\n", bh);
        ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
                               meta_ac);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }

out:
        if (final_depth)
                *final_depth = depth;
        brelse(bh);
        return ret;
}
1577
1578 /*
1579  * This function will discard the rightmost extent record.
1580  */
1581 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1582 {
1583         int next_free = le16_to_cpu(el->l_next_free_rec);
1584         int count = le16_to_cpu(el->l_count);
1585         unsigned int num_bytes;
1586
1587         BUG_ON(!next_free);
1588         /* This will cause us to go off the end of our extent list. */
1589         BUG_ON(next_free >= count);
1590
1591         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1592
1593         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1594 }
1595
/*
 * Insert @insert_rec into leaf list @el at its cpos-sorted position,
 * consuming the leading empty record (if any) to make room.
 */
static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
                              struct ocfs2_extent_rec *insert_rec)
{
        int i, insert_index, next_free, has_empty, num_bytes;
        u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
        struct ocfs2_extent_rec *rec;

        next_free = le16_to_cpu(el->l_next_free_rec);
        has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);

        BUG_ON(!next_free);

        /* The tree code before us didn't allow enough room in the leaf. */
        BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);

        /*
         * The easiest way to approach this is to just remove the
         * empty extent and temporarily decrement next_free.
         */
        if (has_empty) {
                /*
                 * If next_free was 1 (only an empty extent), this
                 * loop won't execute, which is fine. We still want
                 * the decrement below to happen.
                 */
                for(i = 0; i < (next_free - 1); i++)
                        el->l_recs[i] = el->l_recs[i+1];

                next_free--;
        }

        /*
         * Figure out what the new record index should be.
         */
        /* First record with a larger cpos marks the insertion point. */
        for(i = 0; i < next_free; i++) {
                rec = &el->l_recs[i];

                if (insert_cpos < le32_to_cpu(rec->e_cpos))
                        break;
        }
        insert_index = i;

        mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
             insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));

        BUG_ON(insert_index < 0);
        BUG_ON(insert_index >= le16_to_cpu(el->l_count));
        BUG_ON(insert_index > next_free);

        /*
         * No need to memmove if we're just adding to the tail.
         */
        if (insert_index != next_free) {
                BUG_ON(next_free >= le16_to_cpu(el->l_count));

                num_bytes = next_free - insert_index;
                num_bytes *= sizeof(struct ocfs2_extent_rec);
                memmove(&el->l_recs[insert_index + 1],
                        &el->l_recs[insert_index],
                        num_bytes);
        }

        /*
         * Either we had an empty extent, and need to re-increment or
         * there was no empty extent on a non full rightmost leaf node,
         * in which case we still need to increment.
         */
        next_free++;
        el->l_next_free_rec = cpu_to_le16(next_free);
        /*
         * Make sure none of the math above just messed up our tree.
         */
        BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));

        el->l_recs[insert_index] = *insert_rec;

}
1673
1674 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1675 {
1676         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1677
1678         BUG_ON(num_recs == 0);
1679
1680         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1681                 num_recs--;
1682                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1683                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1684                 memset(&el->l_recs[num_recs], 0,
1685                        sizeof(struct ocfs2_extent_rec));
1686                 el->l_next_free_rec = cpu_to_le16(num_recs);
1687         }
1688 }
1689
1690 /*
1691  * Create an empty extent record .
1692  *
1693  * l_next_free_rec may be updated.
1694  *
1695  * If an empty extent already exists do nothing.
1696  */
1697 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1698 {
1699         int next_free = le16_to_cpu(el->l_next_free_rec);
1700
1701         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1702
1703         if (next_free == 0)
1704                 goto set_and_inc;
1705
1706         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1707                 return;
1708
1709         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1710                         "Asked to create an empty extent in a full list:\n"
1711                         "count = %u, tree depth = %u",
1712                         le16_to_cpu(el->l_count),
1713                         le16_to_cpu(el->l_tree_depth));
1714
1715         ocfs2_shift_records_right(el);
1716
1717 set_and_inc:
1718         le16_add_cpu(&el->l_next_free_rec, 1);
1719         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1720 }
1721
1722 /*
1723  * For a rotation which involves two leaf nodes, the "root node" is
1724  * the lowest level tree node which contains a path to both leafs. This
1725  * resulting set of information can be used to form a complete "subtree"
1726  *
1727  * This function is passed two full paths from the dinode down to a
1728  * pair of adjacent leaves. It's task is to figure out which path
1729  * index contains the subtree root - this can be the root index itself
1730  * in a worst-case rotation.
1731  *
1732  * The array index of the subtree root is passed back.
1733  */
1734 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1735                             struct ocfs2_path *left,
1736                             struct ocfs2_path *right)
1737 {
1738         int i = 0;
1739
1740         /*
1741          * Check that the caller passed in two paths from the same tree.
1742          */
1743         BUG_ON(path_root_bh(left) != path_root_bh(right));
1744
1745         do {
1746                 i++;
1747
1748                 /*
1749                  * The caller didn't pass two adjacent paths.
1750                  */
1751                 mlog_bug_on_msg(i > left->p_tree_depth,
1752                                 "Owner %llu, left depth %u, right depth %u\n"
1753                                 "left leaf blk %llu, right leaf blk %llu\n",
1754                                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1755                                 left->p_tree_depth, right->p_tree_depth,
1756                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1757                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1758         } while (left->p_node[i].bh->b_blocknr ==
1759                  right->p_node[i].bh->b_blocknr);
1760
1761         return i - 1;
1762 }
1763
/* Callback invoked on each extent block read during a btree path walk. */
typedef void (path_insert_t)(void *, struct buffer_head *);
1765
1766 /*
1767  * Traverse a btree path in search of cpos, starting at root_el.
1768  *
1769  * This code can be called with a cpos larger than the tree, in which
1770  * case it will return the rightmost path.
1771  */
static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
			     struct ocfs2_extent_list *root_el, u32 cpos,
			     path_insert_t *func, void *data)
{
	int i, ret = 0;
	u32 range;
	u64 blkno;
	struct buffer_head *bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	/*
	 * Descend one level per iteration until we reach a list of
	 * depth zero (a leaf).  Each extent block read on the way down
	 * is handed to 'func' (if any); corrupt on-disk state returns
	 * -EROFS via ocfs2_error().
	 */
	el = root_el;
	while (el->l_tree_depth) {
		/* An interior node must have at least one record. */
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
				    "Owner %llu has empty extent list at "
				    "depth %u\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
				    le16_to_cpu(el->l_tree_depth));
			ret = -EROFS;
			goto out;

		}

		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
			rec = &el->l_recs[i];

			/*
			 * In the case that cpos is off the allocation
			 * tree, this should just wind up returning the
			 * rightmost record.
			 */
			range = le32_to_cpu(rec->e_cpos) +
				ocfs2_rec_clusters(el, rec);
			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
			    break;
		}

		/* Follow the chosen record down; a zero blkno is corruption. */
		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (blkno == 0) {
			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
				    "Owner %llu has bad blkno in extent list "
				    "at depth %u (index %d)\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
				    le16_to_cpu(el->l_tree_depth), i);
			ret = -EROFS;
			goto out;
		}

		/* Release the previous level's block before reading the next. */
		brelse(bh);
		bh = NULL;
		ret = ocfs2_read_extent_block(ci, blkno, &bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;

		/* Sanity-check the child list before trusting it. */
		if (le16_to_cpu(el->l_next_free_rec) >
		    le16_to_cpu(el->l_count)) {
			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
				    "Owner %llu has bad count in extent list "
				    "at block %llu (next free=%u, count=%u)\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
				    (unsigned long long)bh->b_blocknr,
				    le16_to_cpu(el->l_next_free_rec),
				    le16_to_cpu(el->l_count));
			ret = -EROFS;
			goto out;
		}

		/* The callback takes its own reference if it keeps bh. */
		if (func)
			func(data, bh);
	}

out:
	/*
	 * Catch any trailing bh that the loop didn't handle.
	 */
	brelse(bh);

	return ret;
}
1858
1859 /*
1860  * Given an initialized path (that is, it has a valid root extent
1861  * list), this function will traverse the btree in search of the path
1862  * which would contain cpos.
1863  *
1864  * The path traveled is recorded in the path structure.
1865  *
1866  * Note that this will not do any comparisons on leaf node extent
1867  * records, so it will work fine in the case that we just added a tree
1868  * branch.
1869  */
/* Cursor state for recording a full path during __ocfs2_find_path(). */
struct find_path_data {
	int index;
	struct ocfs2_path *path;
};
static void find_path_ins(void *data, struct buffer_head *bh)
{
	struct find_path_data *walk = data;

	/* The path keeps its own reference to each block. */
	get_bh(bh);
	ocfs2_path_insert_eb(walk->path, walk->index++, bh);
}
1882 int ocfs2_find_path(struct ocfs2_caching_info *ci,
1883                     struct ocfs2_path *path, u32 cpos)
1884 {
1885         struct find_path_data data;
1886
1887         data.index = 1;
1888         data.path = path;
1889         return __ocfs2_find_path(ci, path_root_el(path), cpos,
1890                                  find_path_ins, &data);
1891 }
1892
1893 static void find_leaf_ins(void *data, struct buffer_head *bh)
1894 {
1895         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1896         struct ocfs2_extent_list *el = &eb->h_list;
1897         struct buffer_head **ret = data;
1898
1899         /* We want to retain only the leaf block. */
1900         if (le16_to_cpu(el->l_tree_depth) == 0) {
1901                 get_bh(bh);
1902                 *ret = bh;
1903         }
1904 }
1905 /*
1906  * Find the leaf block in the tree which would contain cpos. No
1907  * checking of the actual leaf is done.
1908  *
1909  * Some paths want to call this instead of allocating a path structure
1910  * and calling ocfs2_find_path().
1911  *
1912  * This function doesn't handle non btree extent lists.
1913  */
1914 int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1915                     struct ocfs2_extent_list *root_el, u32 cpos,
1916                     struct buffer_head **leaf_bh)
1917 {
1918         int ret;
1919         struct buffer_head *bh = NULL;
1920
1921         ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1922         if (ret) {
1923                 mlog_errno(ret);
1924                 goto out;
1925         }
1926
1927         *leaf_bh = bh;
1928 out:
1929         return ret;
1930 }
1931
1932 /*
1933  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1934  *
1935  * Basically, we've moved stuff around at the bottom of the tree and
1936  * we need to fix up the extent records above the changes to reflect
1937  * the new changes.
1938  *
1939  * left_rec: the record on the left.
1940  * left_child_el: is the child list pointed to by left_rec
1941  * right_rec: the record to the right of left_rec
1942  * right_child_el: is the child list pointed to by right_rec
1943  *
1944  * By definition, this only works on interior nodes.
1945  */
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
				  struct ocfs2_extent_list *left_child_el,
				  struct ocfs2_extent_rec *right_rec,
				  struct ocfs2_extent_list *right_child_el)
{
	u32 left_clusters, right_end;

	/*
	 * Interior nodes never have holes. Their cpos is the cpos of
	 * the leftmost record in their child list. Their cluster
	 * count covers the full theoretical range of their child list
	 * - the range between their cpos and the cpos of the record
	 * immediately to their right.
	 */
	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
		/*
		 * The right child's first record is empty (zero
		 * clusters), so the real starting cpos is in the
		 * record after it.  The BUG_ONs enforce that only a
		 * leaf list (depth 0) with a following record can hit
		 * this case.
		 */
		BUG_ON(right_child_el->l_tree_depth);
		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
	}
	left_clusters -= le32_to_cpu(left_rec->e_cpos);
	left_rec->e_int_clusters = cpu_to_le32(left_clusters);

	/*
	 * Calculate the rightmost cluster count boundary before
	 * moving cpos - we will need to adjust clusters after
	 * updating e_cpos to keep the same highest cluster count.
	 */
	right_end = le32_to_cpu(right_rec->e_cpos);
	right_end += le32_to_cpu(right_rec->e_int_clusters);

	/* Right record now starts where the left record's range ends. */
	right_rec->e_cpos = left_rec->e_cpos;
	le32_add_cpu(&right_rec->e_cpos, left_clusters);

	right_end -= le32_to_cpu(right_rec->e_cpos);
	right_rec->e_int_clusters = cpu_to_le32(right_end);
}
1983
1984 /*
1985  * Adjust the adjacent root node records involved in a
1986  * rotation. left_el_blkno is passed in as a key so that we can easily
1987  * find it's index in the root list.
1988  */
1989 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1990                                       struct ocfs2_extent_list *left_el,
1991                                       struct ocfs2_extent_list *right_el,
1992                                       u64 left_el_blkno)
1993 {
1994         int i;
1995
1996         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1997                le16_to_cpu(left_el->l_tree_depth));
1998
1999         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
2000                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
2001                         break;
2002         }
2003
2004         /*
2005          * The path walking code should have never returned a root and
2006          * two paths which are not adjacent.
2007          */
2008         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2009
2010         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
2011                                       &root_el->l_recs[i + 1], right_el);
2012 }
2013
2014 /*
2015  * We've changed a leaf block (in right_path) and need to reflect that
2016  * change back up the subtree.
2017  *
2018  * This happens in multiple places:
2019  *   - When we've moved an extent record from the left path leaf to the right
2020  *     path leaf to make room for an empty extent in the left path leaf.
2021  *   - When our insert into the right path leaf is at the leftmost edge
2022  *     and requires an update of the path immediately to it's left. This
2023  *     can occur at the end of some types of rotation and appending inserts.
2024  *   - When we've adjusted the last extent record in the left path leaf and the
2025  *     1st extent record in the right path leaf during cross extent block merge.
2026  */
2027 static void ocfs2_complete_edge_insert(handle_t *handle,
2028                                        struct ocfs2_path *left_path,
2029                                        struct ocfs2_path *right_path,
2030                                        int subtree_index)
2031 {
2032         int i, idx;
2033         struct ocfs2_extent_list *el, *left_el, *right_el;
2034         struct ocfs2_extent_rec *left_rec, *right_rec;
2035         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2036
2037         /*
2038          * Update the counts and position values within all the
2039          * interior nodes to reflect the leaf rotation we just did.
2040          *
2041          * The root node is handled below the loop.
2042          *
2043          * We begin the loop with right_el and left_el pointing to the
2044          * leaf lists and work our way up.
2045          *
2046          * NOTE: within this loop, left_el and right_el always refer
2047          * to the *child* lists.
2048          */
2049         left_el = path_leaf_el(left_path);
2050         right_el = path_leaf_el(right_path);
2051         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2052                 mlog(0, "Adjust records at index %u\n", i);
2053
2054                 /*
2055                  * One nice property of knowing that all of these
2056                  * nodes are below the root is that we only deal with
2057                  * the leftmost right node record and the rightmost
2058                  * left node record.
2059                  */
2060                 el = left_path->p_node[i].el;
2061                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2062                 left_rec = &el->l_recs[idx];
2063
2064                 el = right_path->p_node[i].el;
2065                 right_rec = &el->l_recs[0];
2066
2067                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2068                                               right_el);
2069
2070                 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2071                 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2072
2073                 /*
2074                  * Setup our list pointers now so that the current
2075                  * parents become children in the next iteration.
2076                  */
2077                 left_el = left_path->p_node[i].el;
2078                 right_el = right_path->p_node[i].el;
2079         }
2080
2081         /*
2082          * At the root node, adjust the two adjacent records which
2083          * begin our path to the leaves.
2084          */
2085
2086         el = left_path->p_node[subtree_index].el;
2087         left_el = left_path->p_node[subtree_index + 1].el;
2088         right_el = right_path->p_node[subtree_index + 1].el;
2089
2090         ocfs2_adjust_root_records(el, left_el, right_el,
2091                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
2092
2093         root_bh = left_path->p_node[subtree_index].bh;
2094
2095         ocfs2_journal_dirty(handle, root_bh);
2096 }
2097
2098 static int ocfs2_rotate_subtree_right(handle_t *handle,
2099                                       struct ocfs2_extent_tree *et,
2100                                       struct ocfs2_path *left_path,
2101                                       struct ocfs2_path *right_path,
2102                                       int subtree_index)
2103 {
2104         int ret, i;
2105         struct buffer_head *right_leaf_bh;
2106         struct buffer_head *left_leaf_bh = NULL;
2107         struct buffer_head *root_bh;
2108         struct ocfs2_extent_list *right_el, *left_el;
2109         struct ocfs2_extent_rec move_rec;
2110
2111         left_leaf_bh = path_leaf_bh(left_path);
2112         left_el = path_leaf_el(left_path);
2113
2114         if (left_el->l_next_free_rec != left_el->l_count) {
2115                 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2116                             "Inode %llu has non-full interior leaf node %llu"
2117                             "(next free = %u)",
2118                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2119                             (unsigned long long)left_leaf_bh->b_blocknr,
2120                             le16_to_cpu(left_el->l_next_free_rec));
2121                 return -EROFS;
2122         }
2123
2124         /*
2125          * This extent block may already have an empty record, so we
2126          * return early if so.
2127          */
2128         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2129                 return 0;
2130
2131         root_bh = left_path->p_node[subtree_index].bh;
2132         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2133
2134         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2135                                            subtree_index);
2136         if (ret) {
2137                 mlog_errno(ret);
2138                 goto out;
2139         }
2140
2141         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2142                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2143                                                    right_path, i);
2144                 if (ret) {
2145                         mlog_errno(ret);
2146                         goto out;
2147                 }
2148
2149                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2150                                                    left_path, i);
2151                 if (ret) {
2152                         mlog_errno(ret);
2153                         goto out;
2154                 }
2155         }
2156
2157         right_leaf_bh = path_leaf_bh(right_path);
2158         right_el = path_leaf_el(right_path);
2159
2160         /* This is a code error, not a disk corruption. */
2161         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2162                         "because rightmost leaf block %llu is empty\n",
2163                         (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2164                         (unsigned long long)right_leaf_bh->b_blocknr);
2165
2166         ocfs2_create_empty_extent(right_el);
2167
2168         ocfs2_journal_dirty(handle, right_leaf_bh);
2169
2170         /* Do the copy now. */
2171         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2172         move_rec = left_el->l_recs[i];
2173         right_el->l_recs[0] = move_rec;
2174
2175         /*
2176          * Clear out the record we just copied and shift everything
2177          * over, leaving an empty extent in the left leaf.
2178          *
2179          * We temporarily subtract from next_free_rec so that the
2180          * shift will lose the tail record (which is now defunct).
2181          */
2182         le16_add_cpu(&left_el->l_next_free_rec, -1);
2183         ocfs2_shift_records_right(left_el);
2184         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2185         le16_add_cpu(&left_el->l_next_free_rec, 1);
2186
2187         ocfs2_journal_dirty(handle, left_leaf_bh);
2188
2189         ocfs2_complete_edge_insert(handle, left_path, right_path,
2190                                    subtree_index);
2191
2192 out:
2193         return ret;
2194 }
2195
2196 /*
2197  * Given a full path, determine what cpos value would return us a path
2198  * containing the leaf immediately to the left of the current one.
2199  *
2200  * Will return zero if the path passed in is already the leftmost path.
2201  */
int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
				  struct ocfs2_path *path, u32 *cpos)
{
	int i, j, ret = 0;
	u64 blkno;
	struct ocfs2_extent_list *el;

	/* Only meaningful for an actual btree, not an inline list. */
	BUG_ON(path->p_tree_depth == 0);

	*cpos = 0;

	blkno = path_leaf_bh(path)->b_blocknr;

	/* Start at the tree node just above the leaf and work our way up. */
	i = path->p_tree_depth - 1;
	while (i >= 0) {
		el = path->p_node[i].el;

		/*
		 * Find the extent record just before the one in our
		 * path.
		 */
		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
				if (j == 0) {
					if (i == 0) {
						/*
						 * We've determined that the
						 * path specified is already
						 * the leftmost one - return a
						 * cpos of zero.
						 */
						goto out;
					}
					/*
					 * The leftmost record points to our
					 * leaf - we need to travel up the
					 * tree one level.
					 */
					goto next_node;
				}

				/*
				 * Answer: the last cluster covered by the
				 * record to our left (its cpos plus its
				 * cluster count, minus one).
				 */
				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
				*cpos = *cpos + ocfs2_rec_clusters(el,
							   &el->l_recs[j - 1]);
				*cpos = *cpos - 1;
				goto out;
			}
		}

		/*
		 * If we got here, we never found a valid node where
		 * the tree indicated one should be.
		 */
		ocfs2_error(sb,
			    "Invalid extent tree at extent block %llu\n",
			    (unsigned long long)blkno);
		ret = -EROFS;
		goto out;

next_node:
		blkno = path->p_node[i].bh->b_blocknr;
		i--;
	}

out:
	return ret;
}
2270
2271 /*
2272  * Extend the transaction by enough credits to complete the rotation,
2273  * and still leave at least the original number of credits allocated
2274  * to this transaction.
2275  */
2276 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2277                                            int op_credits,
2278                                            struct ocfs2_path *path)
2279 {
2280         int ret = 0;
2281         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2282
2283         if (handle->h_buffer_credits < credits)
2284                 ret = ocfs2_extend_trans(handle,
2285                                          credits - handle->h_buffer_credits);
2286
2287         return ret;
2288 }
2289
2290 /*
2291  * Trap the case where we're inserting into the theoretical range past
2292  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2293  * whose cpos is less than ours into the right leaf.
2294  *
2295  * It's only necessary to look at the rightmost record of the left
2296  * leaf because the logic that calls us should ensure that the
2297  * theoretical ranges in the path components above the leaves are
2298  * correct.
2299  */
2300 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2301                                                  u32 insert_cpos)
2302 {
2303         struct ocfs2_extent_list *left_el;
2304         struct ocfs2_extent_rec *rec;
2305         int next_free;
2306
2307         left_el = path_leaf_el(left_path);
2308         next_free = le16_to_cpu(left_el->l_next_free_rec);
2309         rec = &left_el->l_recs[next_free - 1];
2310
2311         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2312                 return 1;
2313         return 0;
2314 }
2315
2316 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2317 {
2318         int next_free = le16_to_cpu(el->l_next_free_rec);
2319         unsigned int range;
2320         struct ocfs2_extent_rec *rec;
2321
2322         if (next_free == 0)
2323                 return 0;
2324
2325         rec = &el->l_recs[0];
2326         if (ocfs2_is_empty_extent(rec)) {
2327                 /* Empty list. */
2328                 if (next_free == 1)
2329                         return 0;
2330                 rec = &el->l_recs[1];
2331         }
2332
2333         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2334         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2335                 return 1;
2336         return 0;
2337 }
2338
2339 /*
2340  * Rotate all the records in a btree right one record, starting at insert_cpos.
2341  *
2342  * The path to the rightmost leaf should be passed in.
2343  *
2344  * The array is assumed to be large enough to hold an entire path (tree depth).
2345  *
2346  * Upon successful return from this function:
2347  *
2348  * - The 'right_path' array will contain a path to the leaf block
2349  *   whose range contains e_cpos.
2350  * - That leaf block will have a single empty extent in list index 0.
2351  * - In the case that the rotation requires a post-insert update,
2352  *   *ret_left_path will contain a valid path which can be passed to
2353  *   ocfs2_insert_path().
2354  */
2355 static int ocfs2_rotate_tree_right(handle_t *handle,
2356                                    struct ocfs2_extent_tree *et,
2357                                    enum ocfs2_split_type split,
2358                                    u32 insert_cpos,
2359                                    struct ocfs2_path *right_path,
2360                                    struct ocfs2_path **ret_left_path)
2361 {
2362         int ret, start, orig_credits = handle->h_buffer_credits;
2363         u32 cpos;
2364         struct ocfs2_path *left_path = NULL;
2365         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2366
2367         *ret_left_path = NULL;
2368
2369         left_path = ocfs2_new_path_from_path(right_path);
2370         if (!left_path) {
2371                 ret = -ENOMEM;
2372                 mlog_errno(ret);
2373                 goto out;
2374         }
2375
2376         ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2377         if (ret) {
2378                 mlog_errno(ret);
2379                 goto out;
2380         }
2381
2382         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2383
2384         /*
2385          * What we want to do here is:
2386          *
2387          * 1) Start with the rightmost path.
2388          *
2389          * 2) Determine a path to the leaf block directly to the left
2390          *    of that leaf.
2391          *
2392          * 3) Determine the 'subtree root' - the lowest level tree node
2393          *    which contains a path to both leaves.
2394          *
2395          * 4) Rotate the subtree.
2396          *
2397          * 5) Find the next subtree by considering the left path to be
2398          *    the new right path.
2399          *
2400          * The check at the top of this while loop also accepts
2401          * insert_cpos == cpos because cpos is only a _theoretical_
2402          * value to get us the left path - insert_cpos might very well
2403          * be filling that hole.
2404          *
2405          * Stop at a cpos of '0' because we either started at the
2406          * leftmost branch (i.e., a tree with one branch and a
2407          * rotation inside of it), or we've gone as far as we can in
2408          * rotating subtrees.
2409          */
2410         while (cpos && insert_cpos <= cpos) {
2411                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2412                      insert_cpos, cpos);
2413
2414                 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2415                 if (ret) {
2416                         mlog_errno(ret);
2417                         goto out;
2418                 }
2419
2420                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2421                                 path_leaf_bh(right_path),
2422                                 "Owner %llu: error during insert of %u "
2423                                 "(left path cpos %u) results in two identical "
2424                                 "paths ending at %llu\n",
2425                                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2426                                 insert_cpos, cpos,
2427                                 (unsigned long long)
2428                                 path_leaf_bh(left_path)->b_blocknr);
2429
2430                 if (split == SPLIT_NONE &&
2431                     ocfs2_rotate_requires_path_adjustment(left_path,
2432                                                           insert_cpos)) {
2433
2434                         /*
2435                          * We've rotated the tree as much as we
2436                          * should. The rest is up to
2437                          * ocfs2_insert_path() to complete, after the
2438                          * record insertion. We indicate this
2439                          * situation by returning the left path.
2440                          *
2441                          * The reason we don't adjust the records here
2442                          * before the record insert is that an error
2443                          * later might break the rule where a parent
2444                          * record e_cpos will reflect the actual
2445                          * e_cpos of the 1st nonempty record of the
2446                          * child list.
2447                          */
2448                         *ret_left_path = left_path;
2449                         goto out_ret_path;
2450                 }
2451
2452                 start = ocfs2_find_subtree_root(et, left_path, right_path);
2453
2454                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2455                      start,
2456                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2457                      right_path->p_tree_depth);
2458
2459                 ret = ocfs2_extend_rotate_transaction(handle, start,
2460                                                       orig_credits, right_path);
2461                 if (ret) {
2462                         mlog_errno(ret);
2463                         goto out;
2464                 }
2465
2466                 ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2467                                                  right_path, start);
2468                 if (ret) {
2469                         mlog_errno(ret);
2470                         goto out;
2471                 }
2472
2473                 if (split != SPLIT_NONE &&
2474                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2475                                                 insert_cpos)) {
2476                         /*
2477                          * A rotate moves the rightmost left leaf
2478                          * record over to the leftmost right leaf
2479                          * slot. If we're doing an extent split
2480                          * instead of a real insert, then we have to
2481                          * check that the extent to be split wasn't
2482                          * just moved over. If it was, then we can
2483                          * exit here, passing left_path back -
2484                          * ocfs2_split_extent() is smart enough to
2485                          * search both leaves.
2486                          */
2487                         *ret_left_path = left_path;
2488                         goto out_ret_path;
2489                 }
2490
2491                 /*
2492                  * There is no need to re-read the next right path
2493                  * as we know that it'll be our current left
2494                  * path. Optimize by copying values instead.
2495                  */
2496                 ocfs2_mv_path(right_path, left_path);
2497
2498                 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2499                 if (ret) {
2500                         mlog_errno(ret);
2501                         goto out;
2502                 }
2503         }
2504
2505 out:
2506         ocfs2_free_path(left_path);
2507
2508 out_ret_path:
2509         return ret;
2510 }
2511
/*
 * Fix up interior record lengths along the rightmost path after a
 * branch delete has shrunk the edge of the tree: the last record at
 * every interior level is stretched so that it once again spans out
 * to the end of the (new) rightmost leaf.
 *
 * 'path' must be the rightmost path - we BUG() if its leaf still has
 * a next-leaf pointer.  Returns 0 on success or a negative error from
 * the transaction/journal helpers.
 */
static int ocfs2_update_edge_lengths(handle_t *handle,
				     struct ocfs2_extent_tree *et,
				     int subtree_index, struct ocfs2_path *path)
{
	int i, idx, ret;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_block *eb;
	u32 range;

	/*
	 * In normal tree rotation process, we will never touch the
	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
	 * doesn't reserve the credits for them either.
	 *
	 * But we do have a special case here which will update the rightmost
	 * records for all the bh in the path.
	 * So we have to allocate extra credits and access them.
	 */
	ret = ocfs2_extend_trans(handle, subtree_index);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Path should always be rightmost. */
	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
	BUG_ON(eb->h_next_leaf_blk != 0ULL);

	el = &eb->h_list;
	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
	idx = le16_to_cpu(el->l_next_free_rec) - 1;
	rec = &el->l_recs[idx];
	/* End of the leaf's allocation: last record's cpos + cluster count. */
	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);

	for (i = 0; i < path->p_tree_depth; i++) {
		el = path->p_node[i].el;
		idx = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[idx];

		/* Stretch this level's last record to end at 'range'. */
		rec->e_int_clusters = cpu_to_le32(range);
		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));

		ocfs2_journal_dirty(handle, path->p_node[i].bh);
	}
out:
	return ret;
}
2566
/*
 * Free the extent blocks on 'path' from index 'unlink_start' down to
 * the leaf.  Each block is emptied on disk, queued on 'dealloc' for
 * later freeing, and dropped from the metadata cache.
 *
 * A node which still holds more than one record indicates the caller
 * failed to empty it - it is logged as an error and left allocated
 * (not queued for dealloc), but is still removed from the cache.
 */
static void ocfs2_unlink_path(handle_t *handle,
			      struct ocfs2_extent_tree *et,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      struct ocfs2_path *path, int unlink_start)
{
	int ret, i;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh;

	for(i = unlink_start; i < path_num_items(path); i++) {
		bh = path->p_node[i].bh;

		eb = (struct ocfs2_extent_block *)bh->b_data;
		/*
		 * Not all nodes might have had their final count
		 * decremented by the caller - handle this here.
		 */
		el = &eb->h_list;
		if (le16_to_cpu(el->l_next_free_rec) > 1) {
			mlog(ML_ERROR,
			     "Inode %llu, attempted to remove extent block "
			     "%llu with %u records\n",
			     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
			     (unsigned long long)le64_to_cpu(eb->h_blkno),
			     le16_to_cpu(el->l_next_free_rec));

			ocfs2_journal_dirty(handle, bh);
			ocfs2_remove_from_cache(et->et_ci, bh);
			continue;
		}

		/* Wipe the (at most one) remaining record. */
		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		ocfs2_journal_dirty(handle, bh);

		/*
		 * Queue the block itself for freeing.  This is a void
		 * function, so a failure here can only be logged.
		 */
		ret = ocfs2_cache_extent_block_free(dealloc, eb);
		if (ret)
			mlog_errno(ret);

		ocfs2_remove_from_cache(et->et_ci, bh);
	}
}
2611
2612 static void ocfs2_unlink_subtree(handle_t *handle,
2613                                  struct ocfs2_extent_tree *et,
2614                                  struct ocfs2_path *left_path,
2615                                  struct ocfs2_path *right_path,
2616                                  int subtree_index,
2617                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2618 {
2619         int i;
2620         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2621         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2622         struct ocfs2_extent_list *el;
2623         struct ocfs2_extent_block *eb;
2624
2625         el = path_leaf_el(left_path);
2626
2627         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2628
2629         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2630                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2631                         break;
2632
2633         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2634
2635         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2636         le16_add_cpu(&root_el->l_next_free_rec, -1);
2637
2638         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2639         eb->h_next_leaf_blk = 0;
2640
2641         ocfs2_journal_dirty(handle, root_bh);
2642         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2643
2644         ocfs2_unlink_path(handle, et, dealloc, right_path,
2645                           subtree_index + 1);
2646 }
2647
/*
 * Do a left rotation in the subtree rooted at 'subtree_index' of both
 * paths: the leftmost record of the right leaf moves over to fill the
 * empty 1st extent of the left leaf.
 *
 * If the left leaf does not start with an empty extent this is a
 * no-op (returns 0).
 *
 * If the right leaf ends up empty - only legal when it is the
 * rightmost leaf - the whole right branch is unlinked and freed,
 * last_eb_blk is updated, and '*deleted' is set to 1 so the caller
 * knows right_path is now stale.
 *
 * Returns -EAGAIN when the right leaf starts with an empty extent but
 * is NOT the rightmost leaf; the caller must fix that leaf up and
 * retry.
 */
static int ocfs2_rotate_subtree_left(handle_t *handle,
				     struct ocfs2_extent_tree *et,
				     struct ocfs2_path *left_path,
				     struct ocfs2_path *right_path,
				     int subtree_index,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     int *deleted)
{
	int ret, i, del_right_subtree = 0, right_has_empty = 0;
	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
	struct ocfs2_extent_block *eb;

	*deleted = 0;

	right_leaf_el = path_leaf_el(right_path);
	left_leaf_el = path_leaf_el(left_path);
	root_bh = left_path->p_node[subtree_index].bh;
	/* Both paths must share the same buffer at the subtree root. */
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
		return 0;

	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
		/*
		 * It's legal for us to proceed if the right leaf is
		 * the rightmost one and it has an empty extent. There
		 * are two cases to handle - whether the leaf will be
		 * empty after removal or not. If the leaf isn't empty
		 * then just remove the empty extent up front. The
		 * next block will handle empty leaves by flagging
		 * them for unlink.
		 *
		 * Non rightmost leaves will throw -EAGAIN and the
		 * caller can manually move the subtree and retry.
		 */

		if (eb->h_next_leaf_blk != 0ULL)
			return -EAGAIN;

		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
			ret = ocfs2_journal_access_eb(handle, et->et_ci,
						      path_leaf_bh(right_path),
						      OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ocfs2_remove_empty_extent(right_leaf_el);
		} else
			right_has_empty = 1;
	}

	if (eb->h_next_leaf_blk == 0ULL &&
	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
		/*
		 * We have to update i_last_eb_blk during the meta
		 * data delete.
		 */
		ret = ocfs2_et_root_journal_access(handle, et,
						   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		del_right_subtree = 1;
	}

	/*
	 * Getting here with an empty extent in the right path implies
	 * that it's the rightmost path and will be deleted.
	 */
	BUG_ON(right_has_empty && !del_right_subtree);

	/* Journal everything we're about to modify, root first... */
	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
					   subtree_index);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* ...then both branches below the subtree root. */
	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
						   right_path, i);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
						   left_path, i);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	if (!right_has_empty) {
		/*
		 * Only do this if we're moving a real
		 * record. Otherwise, the action is delayed until
		 * after removal of the right path in which case we
		 * can do a simple shift to remove the empty extent.
		 */
		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
		memset(&right_leaf_el->l_recs[0], 0,
		       sizeof(struct ocfs2_extent_rec));
	}
	if (eb->h_next_leaf_blk == 0ULL) {
		/*
		 * Move recs over to get rid of empty extent, decrease
		 * next_free. This is allowed to remove the last
		 * extent in our leaf (setting l_next_free_rec to
		 * zero) - the delete code below won't care.
		 */
		ocfs2_remove_empty_extent(right_leaf_el);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
	ocfs2_journal_dirty(handle, path_leaf_bh(right_path));

	if (del_right_subtree) {
		ocfs2_unlink_subtree(handle, et, left_path, right_path,
				     subtree_index, dealloc);
		/*
		 * NOTE(review): if ocfs2_update_edge_lengths() fails
		 * here the branch has already been unlinked above,
		 * leaving the edge lengths inconsistent on error -
		 * TODO confirm whether the unlink/update ordering
		 * should be swapped.
		 */
		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
						left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

		/*
		 * Removal of the extent in the left leaf was skipped
		 * above so we could delete the right path
		 * 1st.
		 */
		if (right_has_empty)
			ocfs2_remove_empty_extent(left_leaf_el);

		ocfs2_journal_dirty(handle, et_root_bh);

		*deleted = 1;
	} else
		ocfs2_complete_edge_insert(handle, left_path, right_path,
					   subtree_index);

out:
	return ret;
}
2803
2804 /*
2805  * Given a full path, determine what cpos value would return us a path
2806  * containing the leaf immediately to the right of the current one.
2807  *
2808  * Will return zero if the path passed in is already the rightmost path.
2809  *
2810  * This looks similar, but is subtly different to
2811  * ocfs2_find_cpos_for_left_leaf().
2812  */
2813 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2814                                    struct ocfs2_path *path, u32 *cpos)
2815 {
2816         int i, j, ret = 0;
2817         u64 blkno;
2818         struct ocfs2_extent_list *el;
2819
2820         *cpos = 0;
2821
2822         if (path->p_tree_depth == 0)
2823                 return 0;
2824
2825         blkno = path_leaf_bh(path)->b_blocknr;
2826
2827         /* Start at the tree node just above the leaf and work our way up. */
2828         i = path->p_tree_depth - 1;
2829         while (i >= 0) {
2830                 int next_free;
2831
2832                 el = path->p_node[i].el;
2833
2834                 /*
2835                  * Find the extent record just after the one in our
2836                  * path.
2837                  */
2838                 next_free = le16_to_cpu(el->l_next_free_rec);
2839                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2840                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2841                                 if (j == (next_free - 1)) {
2842                                         if (i == 0) {
2843                                                 /*
2844                                                  * We've determined that the
2845                                                  * path specified is already
2846                                                  * the rightmost one - return a
2847                                                  * cpos of zero.
2848                                                  */
2849                                                 goto out;
2850                                         }
2851                                         /*
2852                                          * The rightmost record points to our
2853                                          * leaf - we need to travel up the
2854                                          * tree one level.
2855                                          */
2856                                         goto next_node;
2857                                 }
2858
2859                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2860                                 goto out;
2861                         }
2862                 }
2863
2864                 /*
2865                  * If we got here, we never found a valid node where
2866                  * the tree indicated one should be.
2867                  */
2868                 ocfs2_error(sb,
2869                             "Invalid extent tree at extent block %llu\n",
2870                             (unsigned long long)blkno);
2871                 ret = -EROFS;
2872                 goto out;
2873
2874 next_node:
2875                 blkno = path->p_node[i].bh->b_blocknr;
2876                 i--;
2877         }
2878
2879 out:
2880         return ret;
2881 }
2882
2883 static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2884                                             struct ocfs2_extent_tree *et,
2885                                             struct ocfs2_path *path)
2886 {
2887         int ret;
2888         struct buffer_head *bh = path_leaf_bh(path);
2889         struct ocfs2_extent_list *el = path_leaf_el(path);
2890
2891         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2892                 return 0;
2893
2894         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2895                                            path_num_items(path) - 1);
2896         if (ret) {
2897                 mlog_errno(ret);
2898                 goto out;
2899         }
2900
2901         ocfs2_remove_empty_extent(el);
2902         ocfs2_journal_dirty(handle, bh);
2903
2904 out:
2905         return ret;
2906 }
2907
/*
 * Rotate records to the left one subtree at a time, beginning at
 * 'path' (whose leaf must start with an empty extent - we BUG()
 * otherwise) and walking toward the rightmost edge of the tree.
 *
 * Stops early if a subtree rotation deletes the rightmost branch.
 *
 * Returns -EAGAIN with '*empty_extent_path' set when a non-rightmost
 * right leaf is found to start with an empty extent; the caller takes
 * ownership of that path (and must free it), fix the leaf up, and may
 * then retry.
 */
static int __ocfs2_rotate_tree_left(handle_t *handle,
				    struct ocfs2_extent_tree *et,
				    int orig_credits,
				    struct ocfs2_path *path,
				    struct ocfs2_cached_dealloc_ctxt *dealloc,
				    struct ocfs2_path **empty_extent_path)
{
	int ret, subtree_root, deleted;
	u32 right_cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_path *right_path = NULL;
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);

	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));

	*empty_extent_path = NULL;

	ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	left_path = ocfs2_new_path_from_path(path);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* 'path' itself is the left side of the 1st rotation. */
	ocfs2_cp_path(left_path, path);

	right_path = ocfs2_new_path_from_path(path);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* right_cpos == 0 means left_path is already the rightmost path. */
	while (right_cpos) {
		ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_root = ocfs2_find_subtree_root(et, left_path,
						       right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     subtree_root,
		     (unsigned long long)
		     right_path->p_node[subtree_root].bh->b_blocknr,
		     right_path->p_tree_depth);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
						      orig_credits, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Caller might still want to make changes to the
		 * tree root, so re-add it to the journal here.
		 */
		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
						   left_path, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_left(handle, et, left_path,
						right_path, subtree_root,
						dealloc, &deleted);
		if (ret == -EAGAIN) {
			/*
			 * The rotation has to temporarily stop due to
			 * the right subtree having an empty
			 * extent. Pass it back to the caller for a
			 * fixup.
			 */
			*empty_extent_path = right_path;
			right_path = NULL;
			goto out;
		}
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The subtree rotate might have removed records on
		 * the rightmost edge. If so, then rotation is
		 * complete.
		 */
		if (deleted)
			break;

		/* The old right path becomes the next rotation's left. */
		ocfs2_mv_path(left_path, right_path);

		ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
						     &right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(right_path);
	ocfs2_free_path(left_path);

	return ret;
}
3024
/*
 * Delete the rightmost branch of the tree.  'path' must be the
 * rightmost path.
 *
 * Two cases: if a branch exists to the left, the rightmost subtree is
 * unlinked from it and the edge lengths plus last_eb_blk are updated
 * to point at the new rightmost leaf.  Otherwise 'path' was the only
 * branch, and the root is reverted to an empty in-line extent list.
 */
static int ocfs2_remove_rightmost_path(handle_t *handle,
				struct ocfs2_extent_tree *et,
				struct ocfs2_path *path,
				struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, subtree_index;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;


	ret = ocfs2_et_sanity_check(et);
	if (ret)
		goto out;
	/*
	 * There's two ways we handle this depending on
	 * whether path is the only existing one.
	 */
	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* cpos of the leaf to our left; 0 means we're the leftmost too. */
	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
					    path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (cpos) {
		/*
		 * We have a path to the left of this one - it needs
		 * an update too.
		 */
		left_path = ocfs2_new_path_from_path(path);
		if (!left_path) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_index = ocfs2_find_subtree_root(et, left_path, path);

		ocfs2_unlink_subtree(handle, et, left_path, path,
				     subtree_index, dealloc);
		/*
		 * NOTE(review): an error from
		 * ocfs2_update_edge_lengths() here leaves the branch
		 * already unlinked with stale edge lengths - TODO
		 * confirm whether the ordering should be swapped.
		 */
		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
						left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* The left leaf is the new rightmost one. */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
	} else {
		/*
		 * 'path' is also the leftmost path which
		 * means it must be the only one. This gets
		 * handled differently because we want to
		 * revert the root back to having extents
		 * in-line.
		 */
		ocfs2_unlink_path(handle, et, dealloc, path, 1);

		el = et->et_root_el;
		el->l_tree_depth = 0;
		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		ocfs2_et_set_last_eb_blk(et, 0);
	}

	ocfs2_journal_dirty(handle, path_root_bh(path));

out:
	ocfs2_free_path(left_path);
	return ret;
}
3126
3127 /*
3128  * Left rotation of btree records.
3129  *
3130  * In many ways, this is (unsurprisingly) the opposite of right
3131  * rotation. We start at some non-rightmost path containing an empty
3132  * extent in the leaf block. The code works its way to the rightmost
3133  * path by rotating records to the left in every subtree.
3134  *
3135  * This is used by any code which reduces the number of extent records
3136  * in a leaf. After removal, an empty record should be placed in the
3137  * leftmost list position.
3138  *
3139  * This won't handle a length update of the rightmost path records if
3140  * the rightmost tree leaf record is removed so the caller is
3141  * responsible for detecting and correcting that.
3142  */
static int ocfs2_rotate_tree_left(handle_t *handle,
                                  struct ocfs2_extent_tree *et,
                                  struct ocfs2_path *path,
                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
{
        int ret, orig_credits = handle->h_buffer_credits;
        struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;

        /* No empty record in slot 0 means there is nothing to rotate away. */
        el = path_leaf_el(path);
        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
                return 0;

        if (path->p_tree_depth == 0) {
rightmost_no_delete:
                /*
                 * Inline extents. This is trivially handled, so do
                 * it up front.
                 */
                ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
                if (ret)
                        mlog_errno(ret);
                goto out;
        }

        /*
         * Handle rightmost branch now. There's several cases:
         *  1) simple rotation leaving records in there. That's trivial.
         *  2) rotation requiring a branch delete - there's no more
         *     records left. Two cases of this:
         *     a) There are branches to the left.
         *     b) This is also the leftmost (the only) branch.
         *
         *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
         *  2a) we need the left branch so that we can update it with the unlink
         *  2b) we need to bring the root back to inline extents.
         */

        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
        el = &eb->h_list;
        if (eb->h_next_leaf_blk == 0) {
                /*
                 * This gets a bit tricky if we're going to delete the
                 * rightmost path. Get the other cases out of the way
                 * 1st.
                 */
                if (le16_to_cpu(el->l_next_free_rec) > 1)
                        goto rightmost_no_delete;

                /* A leaf with zero records is on-disk corruption. */
                if (le16_to_cpu(el->l_next_free_rec) == 0) {
                        ret = -EIO;
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
                                    "Owner %llu has empty extent block at %llu",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
                                    (unsigned long long)le64_to_cpu(eb->h_blkno));
                        goto out;
                }

                /*
                 * XXX: The caller can not trust "path" any more after
                 * this as it will have been deleted. What do we do?
                 *
                 * In theory the rotate-for-merge code will never get
                 * here because it'll always ask for a rotate in a
                 * nonempty list.
                 */

                ret = ocfs2_remove_rightmost_path(handle, et, path,
                                                  dealloc);
                if (ret)
                        mlog_errno(ret);
                goto out;
        }

        /*
         * Now we can loop, remembering the path we get from -EAGAIN
         * and restarting from there.
         */
try_rotate:
        ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
                                       dealloc, &restart_path);
        if (ret && ret != -EAGAIN) {
                mlog_errno(ret);
                goto out;
        }

        /*
         * -EAGAIN hands back a new path (restart_path) from which the
         * rotation must continue; keep going until a full pass
         * completes without needing a restart.
         */
        while (ret == -EAGAIN) {
                tmp_path = restart_path;
                restart_path = NULL;

                ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
                                               tmp_path, dealloc,
                                               &restart_path);
                if (ret && ret != -EAGAIN) {
                        mlog_errno(ret);
                        goto out;
                }

                ocfs2_free_path(tmp_path);
                tmp_path = NULL;

                /* A clean pass may still have left work on "path". */
                if (ret == 0)
                        goto try_rotate;
        }

out:
        ocfs2_free_path(tmp_path);
        ocfs2_free_path(restart_path);
        return ret;
}
3254
3255 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3256                                 int index)
3257 {
3258         struct ocfs2_extent_rec *rec = &el->l_recs[index];
3259         unsigned int size;
3260
3261         if (rec->e_leaf_clusters == 0) {
3262                 /*
3263                  * We consumed all of the merged-from record. An empty
3264                  * extent cannot exist anywhere but the 1st array
3265                  * position, so move things over if the merged-from
3266                  * record doesn't occupy that position.
3267                  *
3268                  * This creates a new empty extent so the caller
3269                  * should be smart enough to have removed any existing
3270                  * ones.
3271                  */
3272                 if (index > 0) {
3273                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3274                         size = index * sizeof(struct ocfs2_extent_rec);
3275                         memmove(&el->l_recs[1], &el->l_recs[0], size);
3276                 }
3277
3278                 /*
3279                  * Always memset - the caller doesn't check whether it
3280                  * created an empty extent, so there could be junk in
3281                  * the other fields.
3282                  */
3283                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3284         }
3285 }
3286
3287 static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3288                                 struct ocfs2_path *left_path,
3289                                 struct ocfs2_path **ret_right_path)
3290 {
3291         int ret;
3292         u32 right_cpos;
3293         struct ocfs2_path *right_path = NULL;
3294         struct ocfs2_extent_list *left_el;
3295
3296         *ret_right_path = NULL;
3297
3298         /* This function shouldn't be called for non-trees. */
3299         BUG_ON(left_path->p_tree_depth == 0);
3300
3301         left_el = path_leaf_el(left_path);
3302         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3303
3304         ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3305                                              left_path, &right_cpos);
3306         if (ret) {
3307                 mlog_errno(ret);
3308                 goto out;
3309         }
3310
3311         /* This function shouldn't be called for the rightmost leaf. */
3312         BUG_ON(right_cpos == 0);
3313
3314         right_path = ocfs2_new_path_from_path(left_path);
3315         if (!right_path) {
3316                 ret = -ENOMEM;
3317                 mlog_errno(ret);
3318                 goto out;
3319         }
3320
3321         ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3322         if (ret) {
3323                 mlog_errno(ret);
3324                 goto out;
3325         }
3326
3327         *ret_right_path = right_path;
3328 out:
3329         if (ret)
3330                 ocfs2_free_path(right_path);
3331         return ret;
3332 }
3333
3334 /*
3335  * Remove split_rec clusters from the record at index and merge them
3336  * onto the beginning of the record "next" to it.
3337  * For index < l_count - 1, the next means the extent rec at index + 1.
3338  * For index == l_count - 1, the "next" means the 1st extent rec of the
3339  * next extent block.
3340  */
static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
                                 handle_t *handle,
                                 struct ocfs2_extent_tree *et,
                                 struct ocfs2_extent_rec *split_rec,
                                 int index)
{
        int ret, next_free, i;
        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
        struct ocfs2_extent_rec *left_rec;
        struct ocfs2_extent_rec *right_rec;
        struct ocfs2_extent_list *right_el;
        struct ocfs2_path *right_path = NULL;
        int subtree_index = 0;
        struct ocfs2_extent_list *el = path_leaf_el(left_path);
        struct buffer_head *bh = path_leaf_bh(left_path);
        struct buffer_head *root_bh = NULL;

        BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
        left_rec = &el->l_recs[index];

        /*
         * "index" is the last used slot of a completely full leaf, so
         * the merge target is the 1st record of the next extent block.
         */
        if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
            le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
                /* we meet with a cross extent block merge. */
                ret = ocfs2_get_right_path(et, left_path, &right_path);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                /* Skip over an empty record in slot 0 of the right leaf. */
                right_el = path_leaf_el(right_path);
                next_free = le16_to_cpu(right_el->l_next_free_rec);
                BUG_ON(next_free <= 0);
                right_rec = &right_el->l_recs[0];
                if (ocfs2_is_empty_extent(right_rec)) {
                        BUG_ON(next_free <= 1);
                        right_rec = &right_el->l_recs[1];
                }

                /* The two records must be logically contiguous. */
                BUG_ON(le32_to_cpu(left_rec->e_cpos) +
                       le16_to_cpu(left_rec->e_leaf_clusters) !=
                       le32_to_cpu(right_rec->e_cpos));

                subtree_index = ocfs2_find_subtree_root(et, left_path,
                                                        right_path);

                /* Both subtrees will be dirtied - make sure the
                 * transaction has credits for them. */
                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
                                                      handle->h_buffer_credits,
                                                      right_path);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                root_bh = left_path->p_node[subtree_index].bh;
                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

                /* Journal the shared subtree root... */
                ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
                                                   subtree_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                /* ...and every block below it on both paths, before
                 * any of them is modified. */
                for (i = subtree_index + 1;
                     i < path_num_items(right_path); i++) {
                        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
                                                           right_path, i);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }

                        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
                                                           left_path, i);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
                }

        } else {
                /* In-leaf merge: the target is simply the next record. */
                BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
                right_rec = &el->l_recs[index + 1];
        }

        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
                                           path_num_items(left_path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        /* Shrink the left record and grow the right one downwards by
         * split_clusters, keeping the pair contiguous. */
        le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);

        le32_add_cpu(&right_rec->e_cpos, -split_clusters);
        le64_add_cpu(&right_rec->e_blkno,
                     -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
                                               split_clusters));
        le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);

        /* The left record may now be empty - keep the leaf legal. */
        ocfs2_cleanup_merge(el, index);

        ocfs2_journal_dirty(handle, bh);
        if (right_path) {
                ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
                /* Propagate the new leaf boundaries up the subtree. */
                ocfs2_complete_edge_insert(handle, left_path, right_path,
                                           subtree_index);
        }
out:
        if (right_path)
                ocfs2_free_path(right_path);
        return ret;
}
3454
3455 static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3456                                struct ocfs2_path *right_path,
3457                                struct ocfs2_path **ret_left_path)
3458 {
3459         int ret;
3460         u32 left_cpos;
3461         struct ocfs2_path *left_path = NULL;
3462
3463         *ret_left_path = NULL;
3464
3465         /* This function shouldn't be called for non-trees. */
3466         BUG_ON(right_path->p_tree_depth == 0);
3467
3468         ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3469                                             right_path, &left_cpos);
3470         if (ret) {
3471                 mlog_errno(ret);
3472                 goto out;
3473         }
3474
3475         /* This function shouldn't be called for the leftmost leaf. */
3476         BUG_ON(left_cpos == 0);
3477
3478         left_path = ocfs2_new_path_from_path(right_path);
3479         if (!left_path) {
3480                 ret = -ENOMEM;
3481                 mlog_errno(ret);
3482                 goto out;
3483         }
3484
3485         ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3486         if (ret) {
3487                 mlog_errno(ret);
3488                 goto out;
3489         }
3490
3491         *ret_left_path = left_path;
3492 out:
3493         if (ret)
3494                 ocfs2_free_path(left_path);
3495         return ret;
3496 }
3497
3498 /*
3499  * Remove split_rec clusters from the record at index and merge them
3500  * onto the tail of the record "before" it.
3501  * For index > 0, the "before" means the extent rec at index - 1.
3502  *
3503  * For index == 0, the "before" means the last record of the previous
3504  * extent block. And there is also a situation that we may need to
3505  * remove the rightmost leaf extent block in the right_path and change
3506  * the right path to indicate the new rightmost path.
3507  */
static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
                                handle_t *handle,
                                struct ocfs2_extent_tree *et,
                                struct ocfs2_extent_rec *split_rec,
                                struct ocfs2_cached_dealloc_ctxt *dealloc,
                                int index)
{
        int ret, i, subtree_index = 0, has_empty_extent = 0;
        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
        struct ocfs2_extent_rec *left_rec;
        struct ocfs2_extent_rec *right_rec;
        struct ocfs2_extent_list *el = path_leaf_el(right_path);
        struct buffer_head *bh = path_leaf_bh(right_path);
        struct buffer_head *root_bh = NULL;
        struct ocfs2_path *left_path = NULL;
        struct ocfs2_extent_list *left_el;

        BUG_ON(index < 0);

        right_rec = &el->l_recs[index];
        if (index == 0) {
                /* we meet with a cross extent block merge. */
                ret = ocfs2_get_left_path(et, right_path, &left_path);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                /* The left leaf must be full, so its last record is
                 * the merge target. */
                left_el = path_leaf_el(left_path);
                BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
                       le16_to_cpu(left_el->l_count));

                left_rec = &left_el->l_recs[
                                le16_to_cpu(left_el->l_next_free_rec) - 1];
                /* The two records must be logically contiguous. */
                BUG_ON(le32_to_cpu(left_rec->e_cpos) +
                       le16_to_cpu(left_rec->e_leaf_clusters) !=
                       le32_to_cpu(split_rec->e_cpos));

                subtree_index = ocfs2_find_subtree_root(et, left_path,
                                                        right_path);

                /* Both subtrees will be dirtied - make sure the
                 * transaction has credits for them. */
                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
                                                      handle->h_buffer_credits,
                                                      left_path);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                root_bh = left_path->p_node[subtree_index].bh;
                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

                /* Journal the shared subtree root... */
                ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
                                                   subtree_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                /* ...and every block below it on both paths, before
                 * any of them is modified. */
                for (i = subtree_index + 1;
                     i < path_num_items(right_path); i++) {
                        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
                                                           right_path, i);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }

                        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
                                                           left_path, i);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
                }
        } else {
                /* In-leaf merge: the target is the previous record. */
                left_rec = &el->l_recs[index - 1];
                if (ocfs2_is_empty_extent(&el->l_recs[0]))
                        has_empty_extent = 1;
        }

        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
                                           path_num_items(right_path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        if (has_empty_extent && index == 1) {
                /*
                 * The easy case - we can just plop the record right in.
                 */
                *left_rec = *split_rec;

                has_empty_extent = 0;
        } else
                le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);

        /* Advance the right record past the clusters it gave away. */
        le32_add_cpu(&right_rec->e_cpos, split_clusters);
        le64_add_cpu(&right_rec->e_blkno,
                     ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
                                              split_clusters));
        le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);

        /* The right record may now be empty - keep the leaf legal. */
        ocfs2_cleanup_merge(el, index);

        ocfs2_journal_dirty(handle, bh);
        if (left_path) {
                ocfs2_journal_dirty(handle, path_leaf_bh(left_path));

                /*
                 * In the situation that the right_rec is empty and the extent
                 * block is empty also, ocfs2_complete_edge_insert can't handle
                 * it and we need to delete the right extent block.
                 */
                if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
                    le16_to_cpu(el->l_next_free_rec) == 1) {

                        ret = ocfs2_remove_rightmost_path(handle, et,
                                                          right_path,
                                                          dealloc);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }

                        /* Now the rightmost extent block has been deleted.
                         * So we use the new rightmost path.
                         */
                        ocfs2_mv_path(right_path, left_path);
                        left_path = NULL;
                } else
                        ocfs2_complete_edge_insert(handle, left_path,
                                                   right_path, subtree_index);
        }
out:
        if (left_path)
                ocfs2_free_path(left_path);
        return ret;
}
3648
/*
 * Try to merge the record at split_index with one or both of its
 * neighbors, as described by ctxt->c_contig_type. Any leaf emptied by
 * the merges is rotated away; freed metadata is queued on "dealloc".
 */
static int ocfs2_try_to_merge_extent(handle_t *handle,
                                     struct ocfs2_extent_tree *et,
                                     struct ocfs2_path *path,
                                     int split_index,
                                     struct ocfs2_extent_rec *split_rec,
                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
                                     struct ocfs2_merge_ctxt *ctxt)
{
        int ret = 0;
        struct ocfs2_extent_list *el = path_leaf_el(path);
        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];

        BUG_ON(ctxt->c_contig_type == CONTIG_NONE);

        if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
                /*
                 * The merge code will need to create an empty
                 * extent to take the place of the newly
                 * emptied slot. Remove any pre-existing empty
                 * extents - having more than one in a leaf is
                 * illegal.
                 */
                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
                /* The rotation shifted records left by one slot. */
                split_index--;
                rec = &el->l_recs[split_index];
        }

        if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
                /*
                 * Left-right contig implies this.
                 */
                BUG_ON(!ctxt->c_split_covers_rec);

                /*
                 * Since the leftright insert always covers the entire
                 * extent, this call will delete the insert record
                 * entirely, resulting in an empty extent record added to
                 * the extent block.
                 *
                 * Since the adding of an empty extent shifts
                 * everything back to the right, there's no need to
                 * update split_index here.
                 *
                 * When the split_index is zero, we need to merge it to the
                 * previous extent block. It is more efficient and easier
                 * if we do merge_right first and merge_left later.
                 */
                ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
                                            split_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                /*
                 * We can only get this from logic error above.
                 */
                BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));

                /* The merge left us with an empty extent, remove it. */
                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                rec = &el->l_recs[split_index];

                /*
                 * Note that we don't pass split_rec here on purpose -
                 * we've merged it into the rec already.
                 */
                ret = ocfs2_merge_rec_left(path, handle, et, rec,
                                           dealloc, split_index);

                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
                /*
                 * Error from this last rotate is not critical, so
                 * print but don't bubble it up.
                 */
                if (ret)
                        mlog_errno(ret);
                ret = 0;
        } else {
                /*
                 * Merge a record to the left or right.
                 *
                 * 'contig_type' is relative to the existing record,
                 * so for example, if we're "right contig", it's to
                 * the record on the left (hence the left merge).
                 */
                if (ctxt->c_contig_type == CONTIG_RIGHT) {
                        ret = ocfs2_merge_rec_left(path, handle, et,
                                                   split_rec, dealloc,
                                                   split_index);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
                } else {
                        ret = ocfs2_merge_rec_right(path, handle,
                                                    et, split_rec,
                                                    split_index);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
                }

                if (ctxt->c_split_covers_rec) {
                        /*
                         * The merge may have left an empty extent in
                         * our leaf. Try to rotate it away.
                         */
                        ret = ocfs2_rotate_tree_left(handle, et, path,
                                                     dealloc);
                        if (ret)
                                mlog_errno(ret);
                        ret = 0;
                }
        }

out:
        return ret;
}
3783
3784 static void ocfs2_subtract_from_rec(struct super_block *sb,
3785                                     enum ocfs2_split_type split,
3786                                     struct ocfs2_extent_rec *rec,
3787                                     struct ocfs2_extent_rec *split_rec)
3788 {
3789         u64 len_blocks;
3790
3791         len_blocks = ocfs2_clusters_to_blocks(sb,
3792                                 le16_to_cpu(split_rec->e_leaf_clusters));
3793
3794         if (split == SPLIT_LEFT) {
3795                 /*
3796                  * Region is on the left edge of the existing
3797                  * record.
3798                  */
3799                 le32_add_cpu(&rec->e_cpos,
3800                              le16_to_cpu(split_rec->e_leaf_clusters));
3801                 le64_add_cpu(&rec->e_blkno, len_blocks);
3802                 le16_add_cpu(&rec->e_leaf_clusters,
3803                              -le16_to_cpu(split_rec->e_leaf_clusters));
3804         } else {
3805                 /*
3806                  * Region is on the right edge of the existing
3807                  * record.
3808                  */
3809                 le16_add_cpu(&rec->e_leaf_clusters,
3810                              -le16_to_cpu(split_rec->e_leaf_clusters));
3811         }
3812 }
3813
3814 /*
3815  * Do the final bits of extent record insertion at the target leaf
3816  * list. If this leaf is part of an allocation tree, it is assumed
3817  * that the tree above has been prepared.
3818  */
3819 static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3820                                  struct ocfs2_extent_rec *insert_rec,
3821                                  struct ocfs2_extent_list *el,
3822                                  struct ocfs2_insert_type *insert)
3823 {
3824         int i = insert->ins_contig_index;
3825         unsigned int range;
3826         struct ocfs2_extent_rec *rec;
3827
3828         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3829
3830         if (insert->ins_split != SPLIT_NONE) {
3831                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3832                 BUG_ON(i == -1);
3833                 rec = &el->l_recs[i];
3834                 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3835                                         insert->ins_split, rec,
3836                                         insert_rec);
3837                 goto rotate;
3838         }
3839
3840         /*
3841          * Contiguous insert - either left or right.
3842          */
3843         if (insert->ins_contig != CONTIG_NONE) {
3844                 rec = &el->l_recs[i];
3845                 if (insert->ins_contig == CONTIG_LEFT) {
3846                         rec->e_blkno = insert_rec->e_blkno;
3847                         rec->e_cpos = insert_rec->e_cpos;
3848                 }
3849                 le16_add_cpu(&rec->e_leaf_clusters,
3850                              le16_to_cpu(insert_rec->e_leaf_clusters));
3851                 return;
3852         }
3853
3854         /*
3855          * Handle insert into an empty leaf.
3856          */
3857         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3858             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3859              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3860                 el->l_recs[0] = *insert_rec;
3861                 el->l_next_free_rec = cpu_to_le16(1);
3862                 return;
3863         }
3864
3865         /*
3866          * Appending insert.
3867          */
3868         if (insert->ins_appending == APPEND_TAIL) {
3869                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3870                 rec = &el->l_recs[i];
3871                 range = le32_to_cpu(rec->e_cpos)
3872                         + le16_to_cpu(rec->e_leaf_clusters);
3873                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3874
3875                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3876                                 le16_to_cpu(el->l_count),
3877                                 "owner %llu, depth %u, count %u, next free %u, "
3878                                 "rec.cpos %u, rec.clusters %u, "
3879                                 "insert.cpos %u, insert.clusters %u\n",
3880                                 ocfs2_metadata_cache_owner(et->et_ci),
3881                                 le16_to_cpu(el->l_tree_depth),
3882                                 le16_to_cpu(el->l_count),
3883                                 le16_to_cpu(el->l_next_free_rec),
3884                                 le32_to_cpu(el->l_recs[i].e_cpos),
3885                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3886                                 le32_to_cpu(insert_rec->e_cpos),
3887                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3888                 i++;
3889                 el->l_recs[i] = *insert_rec;
3890                 le16_add_cpu(&el->l_next_free_rec, 1);
3891                 return;
3892         }
3893
3894 rotate:
3895         /*
3896          * Ok, we have to rotate.
3897          *
3898          * At this point, it is safe to assume that inserting into an
3899          * empty leaf and appending to a leaf have both been handled
3900          * above.
3901          *
3902          * This leaf needs to have space, either by the empty 1st
3903          * extent record, or by virtue of an l_next_rec < l_count.
3904          */
3905         ocfs2_rotate_leaf(el, insert_rec);
3906 }
3907
3908 static void ocfs2_adjust_rightmost_records(handle_t *handle,
3909                                            struct ocfs2_extent_tree *et,
3910                                            struct ocfs2_path *path,
3911                                            struct ocfs2_extent_rec *insert_rec)
3912 {
3913         int ret, i, next_free;
3914         struct buffer_head *bh;
3915         struct ocfs2_extent_list *el;
3916         struct ocfs2_extent_rec *rec;
3917
3918         /*
3919          * Update everything except the leaf block.
3920          */
3921         for (i = 0; i < path->p_tree_depth; i++) {
3922                 bh = path->p_node[i].bh;
3923                 el = path->p_node[i].el;
3924
3925                 next_free = le16_to_cpu(el->l_next_free_rec);
3926                 if (next_free == 0) {
3927                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3928                                     "Owner %llu has a bad extent list",
3929                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3930                         ret = -EIO;
3931                         return;
3932                 }
3933
3934                 rec = &el->l_recs[next_free - 1];
3935
3936                 rec->e_int_clusters = insert_rec->e_cpos;
3937                 le32_add_cpu(&rec->e_int_clusters,
3938                              le16_to_cpu(insert_rec->e_leaf_clusters));
3939                 le32_add_cpu(&rec->e_int_clusters,
3940                              -le32_to_cpu(rec->e_cpos));
3941
3942                 ocfs2_journal_dirty(handle, bh);
3943         }
3944 }
3945
3946 static int ocfs2_append_rec_to_path(handle_t *handle,
3947                                     struct ocfs2_extent_tree *et,
3948                                     struct ocfs2_extent_rec *insert_rec,
3949                                     struct ocfs2_path *right_path,
3950                                     struct ocfs2_path **ret_left_path)
3951 {
3952         int ret, next_free;
3953         struct ocfs2_extent_list *el;
3954         struct ocfs2_path *left_path = NULL;
3955
3956         *ret_left_path = NULL;
3957
3958         /*
3959          * This shouldn't happen for non-trees. The extent rec cluster
3960          * count manipulation below only works for interior nodes.
3961          */
3962         BUG_ON(right_path->p_tree_depth == 0);
3963
3964         /*
3965          * If our appending insert is at the leftmost edge of a leaf,
3966          * then we might need to update the rightmost records of the
3967          * neighboring path.
3968          */
3969         el = path_leaf_el(right_path);
3970         next_free = le16_to_cpu(el->l_next_free_rec);
3971         if (next_free == 0 ||
3972             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3973                 u32 left_cpos;
3974
3975                 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3976                                                     right_path, &left_cpos);
3977                 if (ret) {
3978                         mlog_errno(ret);
3979                         goto out;
3980                 }
3981
3982                 mlog(0, "Append may need a left path update. cpos: %u, "
3983                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3984                      left_cpos);
3985
3986                 /*
3987                  * No need to worry if the append is already in the
3988                  * leftmost leaf.
3989                  */
3990                 if (left_cpos) {
3991                         left_path = ocfs2_new_path_from_path(right_path);
3992                         if (!left_path) {
3993                                 ret = -ENOMEM;
3994                                 mlog_errno(ret);
3995                                 goto out;
3996                         }
3997
3998                         ret = ocfs2_find_path(et->et_ci, left_path,
3999                                               left_cpos);
4000                         if (ret) {
4001                                 mlog_errno(ret);
4002                                 goto out;
4003                         }
4004
4005                         /*
4006                          * ocfs2_insert_path() will pass the left_path to the
4007                          * journal for us.
4008                          */
4009                 }
4010         }
4011
4012         ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4013         if (ret) {
4014                 mlog_errno(ret);
4015                 goto out;
4016         }
4017
4018         ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4019
4020         *ret_left_path = left_path;
4021         ret = 0;
4022 out:
4023         if (ret != 0)
4024                 ocfs2_free_path(left_path);
4025
4026         return ret;
4027 }
4028
/*
 * Insert @split_rec into the appropriate leaf as the final step of a
 * record split.  The record being split is located by cpos; because a
 * preceding rotation may have shifted records between neighboring
 * leaves, the target of both the search and the insert can end up in
 * either right_path's or left_path's leaf.  The found record is shrunk
 * by ocfs2_subtract_from_rec() and @split_rec is then rotated into the
 * chosen leaf.
 *
 * left_path may be NULL; the BUG_ON()s below document exactly when it
 * is required to exist.
 */
static void ocfs2_split_record(struct ocfs2_extent_tree *et,
			       struct ocfs2_path *left_path,
			       struct ocfs2_path *right_path,
			       struct ocfs2_extent_rec *split_rec,
			       enum ocfs2_split_type split)
{
	int index;
	u32 cpos = le32_to_cpu(split_rec->e_cpos);
	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
	struct ocfs2_extent_rec *rec, *tmprec;

	right_el = path_leaf_el(right_path);
	if (left_path)
		left_el = path_leaf_el(left_path);

	/* By default, both the search and the insert target the right leaf. */
	el = right_el;
	insert_el = right_el;
	index = ocfs2_search_extent_list(el, cpos);
	if (index != -1) {
		if (index == 0 && left_path) {
			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));

			/*
			 * This typically means that the record
			 * started in the left path but moved to the
			 * right as a result of rotation. We either
			 * move the existing record to the left, or we
			 * do the later insert there.
			 *
			 * In this case, the left path should always
			 * exist as the rotate code will have passed
			 * it back for a post-insert update.
			 */

			if (split == SPLIT_LEFT) {
				/*
				 * It's a left split. Since we know
				 * that the rotate code gave us an
				 * empty extent in the left path, we
				 * can just do the insert there.
				 */
				insert_el = left_el;
			} else {
				/*
				 * Right split - we have to move the
				 * existing record over to the left
				 * leaf. The insert will be into the
				 * newly created empty extent in the
				 * right leaf.
				 */
				tmprec = &right_el->l_recs[index];
				ocfs2_rotate_leaf(left_el, tmprec);
				el = left_el;

				/*
				 * The record moved leaves - zero its
				 * old slot and re-find it in its new
				 * home before the subtract below.
				 */
				memset(tmprec, 0, sizeof(*tmprec));
				index = ocfs2_search_extent_list(left_el, cpos);
				BUG_ON(index == -1);
			}
		}
	} else {
		BUG_ON(!left_path);
		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
		/*
		 * Left path is easy - we can just allow the insert to
		 * happen.
		 */
		el = left_el;
		insert_el = left_el;
		index = ocfs2_search_extent_list(el, cpos);
		BUG_ON(index == -1);
	}

	/* Shrink the existing record by the split amount, then insert. */
	rec = &el->l_recs[index];
	ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
				split, rec, split_rec);
	ocfs2_rotate_leaf(insert_el, split_rec);
}
4106
/*
 * This function only does inserts on an allocation b-tree. For tree
 * depth = 0, ocfs2_insert_at_leaf() is called directly.
 *
 * right_path is the path we want to do the actual insert
 * in. left_path should only be passed in if we need to update that
 * portion of the tree after an edge insert.
 *
 * Both paths are passed to the journal here; the transaction is
 * extended first when a left_path shows up, since it may not have
 * been accounted for when the transaction was sized.
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int ocfs2_insert_path(handle_t *handle,
			     struct ocfs2_extent_tree *et,
			     struct ocfs2_path *left_path,
			     struct ocfs2_path *right_path,
			     struct ocfs2_extent_rec *insert_rec,
			     struct ocfs2_insert_type *insert)
{
	int ret, subtree_index;
	struct buffer_head *leaf_bh = path_leaf_bh(right_path);

	if (left_path) {
		/*
		 * There's a chance that left_path got passed back to
		 * us without being accounted for in the
		 * journal. Extend our transaction here to be sure we
		 * can change those blocks.
		 */
		ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Pass both paths to the journal. The majority of inserts
	 * will be touching all components anyway.
	 */
	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	if (insert->ins_split != SPLIT_NONE) {
		/*
		 * We could call ocfs2_insert_at_leaf() for some types
		 * of splits, but it's easier to just let one separate
		 * function sort it all out.
		 */
		ocfs2_split_record(et, left_path, right_path,
				   insert_rec, insert->ins_split);

		/*
		 * Split might have modified either leaf and we don't
		 * have a guarantee that the later edge insert will
		 * dirty this for us.
		 */
		if (left_path)
			ocfs2_journal_dirty(handle,
					    path_leaf_bh(left_path));
	} else
		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
				     insert);

	ocfs2_journal_dirty(handle, leaf_bh);

	if (left_path) {
		/*
		 * The rotate code has indicated that we need to fix
		 * up portions of the tree after the insert.
		 *
		 * XXX: Should we extend the transaction here?
		 */
		subtree_index = ocfs2_find_subtree_root(et, left_path,
							right_path);
		ocfs2_complete_edge_insert(handle, left_path, right_path,
					   subtree_index);
	}

	ret = 0;
out:
	return ret;
}
4195
/*
 * Perform a single extent insert into the tree rooted at @et.  The
 * strategy comes from @type (computed by ocfs2_figure_insert_type()):
 * direct insert into a depth-0 root, a tail append, or a right
 * rotation to open up space.  On success, the root's cluster count is
 * bumped for non-split inserts and the root buffer is dirtied.
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int ocfs2_do_insert_extent(handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_extent_rec *insert_rec,
				  struct ocfs2_insert_type *type)
{
	int ret, rotate = 0;
	u32 cpos;
	struct ocfs2_path *right_path = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;

	el = et->et_root_el;

	ret = ocfs2_et_root_journal_access(handle, et,
					   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Depth zero: the root extent list is the leaf, insert directly. */
	if (le16_to_cpu(el->l_tree_depth) == 0) {
		ocfs2_insert_at_leaf(et, insert_rec, el, type);
		goto out_update_clusters;
	}

	right_path = ocfs2_new_path_from_et(et);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Determine the path to start with. Rotations need the
	 * rightmost path, everything else can go directly to the
	 * target leaf.
	 */
	cpos = le32_to_cpu(insert_rec->e_cpos);
	if (type->ins_appending == APPEND_NONE &&
	    type->ins_contig == CONTIG_NONE) {
		rotate = 1;
		cpos = UINT_MAX;
	}

	ret = ocfs2_find_path(et->et_ci, right_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Rotations and appends need special treatment - they modify
	 * parts of the tree above them.
	 *
	 * Both might pass back a path immediately to the left of the
	 * one being inserted to. This will cause
	 * ocfs2_insert_path() to modify the rightmost records of
	 * left_path to account for an edge insert.
	 *
	 * XXX: When modifying this code, keep in mind that an insert
	 * can wind up skipping both of these two special cases...
	 */
	if (rotate) {
		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
					      le32_to_cpu(insert_rec->e_cpos),
					      right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * ocfs2_rotate_tree_right() might have extended the
		 * transaction without re-journaling our tree root.
		 */
		ret = ocfs2_et_root_journal_access(handle, et,
						   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else if (type->ins_appending == APPEND_TAIL
		   && type->ins_contig != CONTIG_LEFT) {
		ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
					       right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_insert_path(handle, et, left_path, right_path,
				insert_rec, type);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out_update_clusters:
	/* Splits don't add clusters to the tree, plain inserts do. */
	if (type->ins_split == SPLIT_NONE)
		ocfs2_et_update_clusters(et,
					 le16_to_cpu(insert_rec->e_leaf_clusters));

	ocfs2_journal_dirty(handle, et->et_root_bh);

out:
	/* ocfs2_free_path() handles NULL paths. */
	ocfs2_free_path(left_path);
	ocfs2_free_path(right_path);

	return ret;
}
4307
4308 static enum ocfs2_contig_type
4309 ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4310                                struct ocfs2_path *path,
4311                                struct ocfs2_extent_list *el, int index,
4312                                struct ocfs2_extent_rec *split_rec)
4313 {
4314         int status;
4315         enum ocfs2_contig_type ret = CONTIG_NONE;
4316         u32 left_cpos, right_cpos;
4317         struct ocfs2_extent_rec *rec = NULL;
4318         struct ocfs2_extent_list *new_el;
4319         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4320         struct buffer_head *bh;
4321         struct ocfs2_extent_block *eb;
4322         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4323
4324         if (index > 0) {
4325                 rec = &el->l_recs[index - 1];
4326         } else if (path->p_tree_depth > 0) {
4327                 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4328                 if (status)
4329                         goto out;
4330
4331                 if (left_cpos != 0) {
4332                         left_path = ocfs2_new_path_from_path(path);
4333                         if (!left_path)
4334                                 goto out;
4335
4336                         status = ocfs2_find_path(et->et_ci, left_path,
4337                                                  left_cpos);
4338                         if (status)
4339                                 goto out;
4340
4341                         new_el = path_leaf_el(left_path);
4342
4343                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4344                             le16_to_cpu(new_el->l_count)) {
4345                                 bh = path_leaf_bh(left_path);
4346                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4347                                 ocfs2_error(sb,
4348                                             "Extent block #%llu has an "
4349                                             "invalid l_next_free_rec of "
4350                                             "%d.  It should have "
4351                                             "matched the l_count of %d",
4352                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4353                                             le16_to_cpu(new_el->l_next_free_rec),
4354                                             le16_to_cpu(new_el->l_count));
4355                                 status = -EINVAL;
4356                                 goto out;
4357                         }
4358                         rec = &new_el->l_recs[
4359                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4360                 }
4361         }
4362
4363         /*
4364          * We're careful to check for an empty extent record here -
4365          * the merge code will know what to do if it sees one.
4366          */
4367         if (rec) {
4368                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4369                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4370                                 ret = CONTIG_RIGHT;
4371                 } else {
4372                         ret = ocfs2_et_extent_contig(et, rec, split_rec);
4373                 }
4374         }
4375
4376         rec = NULL;
4377         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4378                 rec = &el->l_recs[index + 1];
4379         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4380                  path->p_tree_depth > 0) {
4381                 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4382                 if (status)
4383                         goto out;
4384
4385                 if (right_cpos == 0)
4386                         goto out;
4387
4388                 right_path = ocfs2_new_path_from_path(path);
4389                 if (!right_path)
4390                         goto out;
4391
4392                 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4393                 if (status)
4394                         goto out;
4395
4396                 new_el = path_leaf_el(right_path);
4397                 rec = &new_el->l_recs[0];
4398                 if (ocfs2_is_empty_extent(rec)) {
4399                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4400                                 bh = path_leaf_bh(right_path);
4401                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4402                                 ocfs2_error(sb,
4403                                             "Extent block #%llu has an "
4404                                             "invalid l_next_free_rec of %d",
4405                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4406                                             le16_to_cpu(new_el->l_next_free_rec));
4407                                 status = -EINVAL;
4408                                 goto out;
4409                         }
4410                         rec = &new_el->l_recs[1];
4411                 }
4412         }
4413
4414         if (rec) {
4415                 enum ocfs2_contig_type contig_type;
4416
4417                 contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4418
4419                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4420                         ret = CONTIG_LEFTRIGHT;
4421                 else if (ret == CONTIG_NONE)
4422                         ret = contig_type;
4423         }
4424
4425 out:
4426         if (left_path)
4427                 ocfs2_free_path(left_path);
4428         if (right_path)
4429                 ocfs2_free_path(right_path);
4430
4431         return ret;
4432 }
4433
4434 static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4435                                      struct ocfs2_insert_type *insert,
4436                                      struct ocfs2_extent_list *el,
4437                                      struct ocfs2_extent_rec *insert_rec)
4438 {
4439         int i;
4440         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4441
4442         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4443
4444         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4445                 contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4446                                                      insert_rec);
4447                 if (contig_type != CONTIG_NONE) {
4448                         insert->ins_contig_index = i;
4449                         break;
4450                 }
4451         }
4452         insert->ins_contig = contig_type;
4453
4454         if (insert->ins_contig != CONTIG_NONE) {
4455                 struct ocfs2_extent_rec *rec =
4456                                 &el->l_recs[insert->ins_contig_index];
4457                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4458                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4459
4460                 /*
4461                  * Caller might want us to limit the size of extents, don't
4462                  * calculate contiguousness if we might exceed that limit.
4463                  */
4464                 if (et->et_max_leaf_clusters &&
4465                     (len > et->et_max_leaf_clusters))
4466                         insert->ins_contig = CONTIG_NONE;
4467         }
4468 }
4469
/*
 * This should only be called against the rightmost leaf extent list.
 *
 * ocfs2_figure_appending_type() will figure out whether we'll have to
 * insert at the tail of the rightmost leaf.
 *
 * This should also work against the root extent list for trees with 0
 * depth. If we consider the root extent list to be the rightmost leaf node
 * then the logic here makes sense.
 */
4480 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4481                                         struct ocfs2_extent_list *el,
4482                                         struct ocfs2_extent_rec *insert_rec)
4483 {
4484         int i;
4485         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4486         struct ocfs2_extent_rec *rec;
4487
4488         insert->ins_appending = APPEND_NONE;
4489
4490         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4491
4492         if (!el->l_next_free_rec)
4493                 goto set_tail_append;
4494
4495         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4496                 /* Were all records empty? */
4497                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4498                         goto set_tail_append;
4499         }
4500
4501         i = le16_to_cpu(el->l_next_free_rec) - 1;
4502         rec = &el->l_recs[i];
4503
4504         if (cpos >=
4505             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4506                 goto set_tail_append;
4507
4508         return;
4509
4510 set_tail_append:
4511         insert->ins_appending = APPEND_TAIL;
4512 }
4513
4514 /*
 * Helper function called at the beginning of an insert.
4516  *
4517  * This computes a few things that are commonly used in the process of
4518  * inserting into the btree:
4519  *   - Whether the new extent is contiguous with an existing one.
4520  *   - The current tree depth.
4521  *   - Whether the insert is an appending one.
4522  *   - The total # of free records in the tree.
4523  *
4524  * All of the information is stored on the ocfs2_insert_type
4525  * structure.
4526  */
4527 static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4528                                     struct buffer_head **last_eb_bh,
4529                                     struct ocfs2_extent_rec *insert_rec,
4530                                     int *free_records,
4531                                     struct ocfs2_insert_type *insert)
4532 {
4533         int ret;
4534         struct ocfs2_extent_block *eb;
4535         struct ocfs2_extent_list *el;
4536         struct ocfs2_path *path = NULL;
4537         struct buffer_head *bh = NULL;
4538
4539         insert->ins_split = SPLIT_NONE;
4540
4541         el = et->et_root_el;
4542         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4543
4544         if (el->l_tree_depth) {
4545                 /*
4546                  * If we have tree depth, we read in the
4547                  * rightmost extent block ahead of time as
4548                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4549                  * may want it later.
4550                  */
4551                 ret = ocfs2_read_extent_block(et->et_ci,
4552                                               ocfs2_et_get_last_eb_blk(et),
4553                                               &bh);
4554                 if (ret) {
4555                         mlog_exit(ret);
4556                         goto out;
4557                 }
4558                 eb = (struct ocfs2_extent_block *) bh->b_data;
4559                 el = &eb->h_list;
4560         }
4561
4562         /*
4563          * Unless we have a contiguous insert, we'll need to know if
4564          * there is room left in our allocation tree for another
4565          * extent record.
4566          *
4567          * XXX: This test is simplistic, we can search for empty
4568          * extent records too.
4569          */
4570         *free_records = le16_to_cpu(el->l_count) -
4571                 le16_to_cpu(el->l_next_free_rec);
4572
4573         if (!insert->ins_tree_depth) {
4574                 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4575                 ocfs2_figure_appending_type(insert, el, insert_rec);
4576                 return 0;
4577         }
4578
4579         path = ocfs2_new_path_from_et(et);
4580         if (!path) {
4581                 ret = -ENOMEM;
4582                 mlog_errno(ret);
4583                 goto out;
4584         }
4585
4586         /*
4587          * In the case that we're inserting past what the tree
4588          * currently accounts for, ocfs2_find_path() will return for
4589          * us the rightmost tree path. This is accounted for below in
4590          * the appending code.
4591          */
4592         ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4593         if (ret) {
4594                 mlog_errno(ret);
4595                 goto out;
4596         }
4597
4598         el = path_leaf_el(path);
4599
4600         /*
4601          * Now that we have the path, there's two things we want to determine:
4602          * 1) Contiguousness (also set contig_index if this is so)
4603          *
4604          * 2) Are we doing an append? We can trivially break this up
4605          *     into two types of appends: simple record append, or a
4606          *     rotate inside the tail leaf.
4607          */
4608         ocfs2_figure_contig_type(et, insert, el, insert_rec);
4609
4610         /*
4611          * The insert code isn't quite ready to deal with all cases of
4612          * left contiguousness. Specifically, if it's an insert into
4613          * the 1st record in a leaf, it will require the adjustment of
4614          * cluster count on the last record of the path directly to it's
4615          * left. For now, just catch that case and fool the layers
4616          * above us. This works just fine for tree_depth == 0, which
4617          * is why we allow that above.
4618          */
4619         if (insert->ins_contig == CONTIG_LEFT &&
4620             insert->ins_contig_index == 0)
4621                 insert->ins_contig = CONTIG_NONE;
4622
4623         /*
4624          * Ok, so we can simply compare against last_eb to figure out
4625          * whether the path doesn't exist. This will only happen in
4626          * the case that we're doing a tail append, so maybe we can
4627          * take advantage of that information somehow.
4628          */
4629         if (ocfs2_et_get_last_eb_blk(et) ==
4630             path_leaf_bh(path)->b_blocknr) {
4631                 /*
4632                  * Ok, ocfs2_find_path() returned us the rightmost
4633                  * tree path. This might be an appending insert. There are
4634                  * two cases:
4635                  *    1) We're doing a true append at the tail:
4636                  *      -This might even be off the end of the leaf
4637                  *    2) We're "appending" by rotating in the tail
4638                  */
4639                 ocfs2_figure_appending_type(insert, el, insert_rec);
4640         }
4641
4642 out:
4643         ocfs2_free_path(path);
4644
4645         if (ret == 0)
4646                 *last_eb_bh = bh;
4647         else
4648                 brelse(bh);
4649         return ret;
4650 }
4651
4652 /*
4653  * Insert an extent into a btree.
4654  *
4655  * The caller needs to update the owning btree's cluster count.
4656  */
4657 int ocfs2_insert_extent(handle_t *handle,
4658                         struct ocfs2_extent_tree *et,
4659                         u32 cpos,
4660                         u64 start_blk,
4661                         u32 new_clusters,
4662                         u8 flags,
4663                         struct ocfs2_alloc_context *meta_ac)
4664 {
4665         int status;
4666         int uninitialized_var(free_records);
4667         struct buffer_head *last_eb_bh = NULL;
4668         struct ocfs2_insert_type insert = {0, };
4669         struct ocfs2_extent_rec rec;
4670
4671         mlog(0, "add %u clusters at position %u to owner %llu\n",
4672              new_clusters, cpos,
4673              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4674
4675         memset(&rec, 0, sizeof(rec));
4676         rec.e_cpos = cpu_to_le32(cpos);
4677         rec.e_blkno = cpu_to_le64(start_blk);
4678         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4679         rec.e_flags = flags;
4680         status = ocfs2_et_insert_check(et, &rec);
4681         if (status) {
4682                 mlog_errno(status);
4683                 goto bail;
4684         }
4685
4686         status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4687                                           &free_records, &insert);
4688         if (status < 0) {
4689                 mlog_errno(status);
4690                 goto bail;
4691         }
4692
4693         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4694              "Insert.contig_index: %d, Insert.free_records: %d, "
4695              "Insert.tree_depth: %d\n",
4696              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4697              free_records, insert.ins_tree_depth);
4698
4699         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4700                 status = ocfs2_grow_tree(handle, et,
4701                                          &insert.ins_tree_depth, &last_eb_bh,
4702                                          meta_ac);
4703                 if (status) {
4704                         mlog_errno(status);
4705                         goto bail;
4706                 }
4707         }
4708
4709         /* Finally, we can add clusters. This might rotate the tree for us. */
4710         status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4711         if (status < 0)
4712                 mlog_errno(status);
4713         else
4714                 ocfs2_et_extent_map_insert(et, &rec);
4715
4716 bail:
4717         brelse(last_eb_bh);
4718
4719         mlog_exit(status);
4720         return status;
4721 }
4722
4723 /*
 * Allocate and add clusters into the extent b-tree.
4725  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4726  * The extent b-tree's root is specified by et, and
4727  * it is not limited to the file storage. Any extent tree can use this
4728  * function if it implements the proper ocfs2_extent_tree.
4729  */
int ocfs2_add_clusters_in_btree(handle_t *handle,
				struct ocfs2_extent_tree *et,
				u32 *logical_offset,
				u32 clusters_to_add,
				int mark_unwritten,
				struct ocfs2_alloc_context *data_ac,
				struct ocfs2_alloc_context *meta_ac,
				enum ocfs2_alloc_restarted *reason_ret)
{
	int status = 0;
	int free_extents;
	enum ocfs2_alloc_restarted reason = RESTART_NONE;
	u32 bit_off, num_bits;
	u64 block;
	u8 flags = 0;
	struct ocfs2_super *osb =
		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));

	BUG_ON(!clusters_to_add);

	/* Newly allocated clusters are flagged unwritten until the
	 * caller actually writes them. */
	if (mark_unwritten)
		flags = OCFS2_EXT_UNWRITTEN;

	/* How many more extent records can this tree hold without
	 * growing?  Negative return is an error code. */
	free_extents = ocfs2_num_free_extents(osb, et);
	if (free_extents < 0) {
		status = free_extents;
		mlog_errno(status);
		goto leave;
	}

	/* there are two cases which could cause us to EAGAIN in the
	 * we-need-more-metadata case:
	 * 1) we haven't reserved *any*
	 * 2) we are so fragmented, we've needed to add metadata too
	 *    many times. */
	if (!free_extents && !meta_ac) {
		mlog(0, "we haven't reserved any metadata!\n");
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	} else if ((!free_extents)
		   && (ocfs2_alloc_context_bits_left(meta_ac)
		       < ocfs2_extend_meta_needed(et->et_root_el))) {
		mlog(0, "filesystem is really fragmented...\n");
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	}

	/* Claim up to clusters_to_add contiguous clusters; the
	 * allocator may return fewer (num_bits) than requested. */
	status = __ocfs2_claim_clusters(handle, data_ac, 1,
					clusters_to_add, &bit_off, &num_bits);
	if (status < 0) {
		/* -ENOSPC is an expected outcome, not worth logging. */
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	BUG_ON(num_bits > clusters_to_add);

	/* reserve our write early -- insert_extent may update the tree root */
	status = ocfs2_et_root_journal_access(handle, et,
					      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
	     num_bits, bit_off,
	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
	status = ocfs2_insert_extent(handle, et, *logical_offset, block,
				     num_bits, flags, meta_ac);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	ocfs2_journal_dirty(handle, et->et_root_bh);

	/* Record partial progress so a restarted caller resumes where
	 * this allocation left off. */
	clusters_to_add -= num_bits;
	*logical_offset += num_bits;

	/* Didn't get everything in one claim - ask the caller to
	 * restart the transaction and call back in. */
	if (clusters_to_add) {
		mlog(0, "need to alloc once more, wanted = %u\n",
		     clusters_to_add);
		status = -EAGAIN;
		reason = RESTART_TRANS;
	}

leave:
	mlog_exit(status);
	if (reason_ret)
		*reason_ret = reason;
	return status;
}
4826
4827 static void ocfs2_make_right_split_rec(struct super_block *sb,
4828                                        struct ocfs2_extent_rec *split_rec,
4829                                        u32 cpos,
4830                                        struct ocfs2_extent_rec *rec)
4831 {
4832         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4833         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4834
4835         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4836
4837         split_rec->e_cpos = cpu_to_le32(cpos);
4838         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4839
4840         split_rec->e_blkno = rec->e_blkno;
4841         le64_add_cpu(&split_rec->e_blkno,
4842                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4843
4844         split_rec->e_flags = rec->e_flags;
4845 }
4846
/*
 * Insert *orig_split_rec into the leaf pointed to by path, splitting
 * the record at split_index around it.  A split that lands in the
 * middle of the existing record is handled in two passes: a right
 * split first, then a second trip through the loop as a left split.
 */
static int ocfs2_split_and_insert(handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_path *path,
				  struct buffer_head **last_eb_bh,
				  int split_index,
				  struct ocfs2_extent_rec *orig_split_rec,
				  struct ocfs2_alloc_context *meta_ac)
{
	int ret = 0, depth;
	unsigned int insert_range, rec_range, do_leftright = 0;
	struct ocfs2_extent_rec tmprec;
	struct ocfs2_extent_list *rightmost_el;
	struct ocfs2_extent_rec rec;
	struct ocfs2_extent_rec split_rec = *orig_split_rec;
	struct ocfs2_insert_type insert;
	struct ocfs2_extent_block *eb;

leftright:
	/*
	 * Store a copy of the record on the stack - it might move
	 * around as the tree is manipulated below.
	 */
	rec = path_leaf_el(path)->l_recs[split_index];

	rightmost_el = et->et_root_el;

	depth = le16_to_cpu(rightmost_el->l_tree_depth);
	if (depth) {
		BUG_ON(!(*last_eb_bh));
		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
		rightmost_el = &eb->h_list;
	}

	/* The split below adds a record; grow the tree first if the
	 * rightmost list is already full. */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(handle, et,
				      &depth, last_eb_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_tree_depth = depth;

	insert_range = le32_to_cpu(split_rec.e_cpos) +
		le16_to_cpu(split_rec.e_leaf_clusters);
	rec_range = le32_to_cpu(rec.e_cpos) +
		le16_to_cpu(rec.e_leaf_clusters);

	if (split_rec.e_cpos == rec.e_cpos) {
		/* Split range starts where the record starts. */
		insert.ins_split = SPLIT_LEFT;
	} else if (insert_range == rec_range) {
		/* Split range ends where the record ends. */
		insert.ins_split = SPLIT_RIGHT;
	} else {
		/*
		 * Left/right split. We fake this as a right split
		 * first and then make a second pass as a left split.
		 */
		insert.ins_split = SPLIT_RIGHT;

		ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
					   &tmprec, insert_range, &rec);

		split_rec = tmprec;

		BUG_ON(do_leftright);
		do_leftright = 1;
	}

	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (do_leftright == 1) {
		u32 cpos;
		struct ocfs2_extent_list *el;

		/* Second pass: the insert above may have reshaped the
		 * tree, so re-find the path before the left split. */
		do_leftright++;
		split_rec = *orig_split_rec;

		ocfs2_reinit_path(path, 1);

		cpos = le32_to_cpu(split_rec.e_cpos);
		ret = ocfs2_find_path(et->et_ci, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		split_index = ocfs2_search_extent_list(el, cpos);
		goto leftright;
	}
out:

	return ret;
}
4950
4951 static int ocfs2_replace_extent_rec(handle_t *handle,
4952                                     struct ocfs2_extent_tree *et,
4953                                     struct ocfs2_path *path,
4954                                     struct ocfs2_extent_list *el,
4955                                     int split_index,
4956                                     struct ocfs2_extent_rec *split_rec)
4957 {
4958         int ret;
4959
4960         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
4961                                            path_num_items(path) - 1);
4962         if (ret) {
4963                 mlog_errno(ret);
4964                 goto out;
4965         }
4966
4967         el->l_recs[split_index] = *split_rec;
4968
4969         ocfs2_journal_dirty(handle, path_leaf_bh(path));
4970 out:
4971         return ret;
4972 }
4973
4974 /*
4975  * Split part or all of the extent record at split_index in the leaf
4976  * pointed to by path. Merge with the contiguous extent record if needed.
4977  *
4978  * Care is taken to handle contiguousness so as to not grow the tree.
4979  *
4980  * meta_ac is not strictly necessary - we only truly need it if growth
4981  * of the tree is required. All other cases will degrade into a less
4982  * optimal tree layout.
4983  *
4984  * last_eb_bh should be the rightmost leaf block for any extent
4985  * btree. Since a split may grow the tree or a merge might shrink it,
4986  * the caller cannot trust the contents of that buffer after this call.
4987  *
4988  * This code is optimized for readability - several passes might be
4989  * made over certain portions of the tree. All of those blocks will
4990  * have been brought into cache (and pinned via the journal), so the
4991  * extra overhead is not expressed in terms of disk reads.
4992  */
4993 int ocfs2_split_extent(handle_t *handle,
4994                        struct ocfs2_extent_tree *et,
4995                        struct ocfs2_path *path,
4996                        int split_index,
4997                        struct ocfs2_extent_rec *split_rec,
4998                        struct ocfs2_alloc_context *meta_ac,
4999                        struct ocfs2_cached_dealloc_ctxt *dealloc)
5000 {
5001         int ret = 0;
5002         struct ocfs2_extent_list *el = path_leaf_el(path);
5003         struct buffer_head *last_eb_bh = NULL;
5004         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5005         struct ocfs2_merge_ctxt ctxt;
5006         struct ocfs2_extent_list *rightmost_el;
5007
5008         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5009             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5010              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5011                 ret = -EIO;
5012                 mlog_errno(ret);
5013                 goto out;
5014         }
5015
5016         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
5017                                                             split_index,
5018                                                             split_rec);
5019
5020         /*
5021          * The core merge / split code wants to know how much room is
5022          * left in this allocation tree, so we pass the
5023          * rightmost extent list.
5024          */
5025         if (path->p_tree_depth) {
5026                 struct ocfs2_extent_block *eb;
5027
5028                 ret = ocfs2_read_extent_block(et->et_ci,
5029                                               ocfs2_et_get_last_eb_blk(et),
5030                                               &last_eb_bh);
5031                 if (ret) {
5032                         mlog_exit(ret);
5033                         goto out;
5034                 }
5035
5036                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5037                 rightmost_el = &eb->h_list;
5038         } else
5039                 rightmost_el = path_root_el(path);
5040
5041         if (rec->e_cpos == split_rec->e_cpos &&
5042             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5043                 ctxt.c_split_covers_rec = 1;
5044         else
5045                 ctxt.c_split_covers_rec = 0;
5046
5047         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5048
5049         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
5050              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
5051              ctxt.c_split_covers_rec);
5052
5053         if (ctxt.c_contig_type == CONTIG_NONE) {
5054                 if (ctxt.c_split_covers_rec)
5055                         ret = ocfs2_replace_extent_rec(handle, et, path, el,
5056                                                        split_index, split_rec);
5057                 else
5058                         ret = ocfs2_split_and_insert(handle, et, path,
5059                                                      &last_eb_bh, split_index,
5060                                                      split_rec, meta_ac);
5061                 if (ret)
5062                         mlog_errno(ret);
5063         } else {
5064                 ret = ocfs2_try_to_merge_extent(handle, et, path,
5065                                                 split_index, split_rec,
5066                                                 dealloc, &ctxt);
5067                 if (ret)
5068                         mlog_errno(ret);
5069         }
5070
5071 out:
5072         brelse(last_eb_bh);
5073         return ret;
5074 }
5075
5076 /*
5077  * Change the flags of the already-existing extent at cpos for len clusters.
5078  *
5079  * new_flags: the flags we want to set.
5080  * clear_flags: the flags we want to clear.
5081  * phys: the new physical offset we want this new extent starts from.
5082  *
5083  * If the existing extent is larger than the request, initiate a
5084  * split. An attempt will be made at merging with adjacent extents.
5085  *
5086  * The caller is responsible for passing down meta_ac if we'll need it.
5087  */
int ocfs2_change_extent_flag(handle_t *handle,
			     struct ocfs2_extent_tree *et,
			     u32 cpos, u32 len, u32 phys,
			     struct ocfs2_alloc_context *meta_ac,
			     struct ocfs2_cached_dealloc_ctxt *dealloc,
			     int new_flags, int clear_flags)
{
	int ret, index;
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
	u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	left_path = ocfs2_new_path_from_et(et);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Walk down to the leaf containing cpos. */
	ret = ocfs2_find_path(et->et_ci, left_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	el = path_leaf_el(left_path);

	/* The caller named an extent we can't find - treat it as
	 * on-disk corruption and take the filesystem read-only. */
	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
		ocfs2_error(sb,
			    "Owner %llu has an extent at cpos %u which can no "
			    "longer be found.\n",
			     (unsigned long long)
			     ocfs2_metadata_cache_owner(et->et_ci), cpos);
		ret = -EROFS;
		goto out;
	}

	/* Sanity check the request against the current flags: setting
	 * a flag that is already set, or clearing one that isn't, is
	 * a caller bug. */
	ret = -EIO;
	rec = &el->l_recs[index];
	if (new_flags && (rec->e_flags & new_flags)) {
		mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
		     "extent that already had them",
		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
		     new_flags);
		goto out;
	}

	if (clear_flags && !(rec->e_flags & clear_flags)) {
		mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
		     "extent that didn't have them",
		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
		     clear_flags);
		goto out;
	}

	/* Describe the replacement range: len clusters at cpos, new
	 * physical start, flags adjusted per the request. */
	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
	split_rec.e_cpos = cpu_to_le32(cpos);
	split_rec.e_leaf_clusters = cpu_to_le16(len);
	split_rec.e_blkno = cpu_to_le64(start_blkno);
	split_rec.e_flags = rec->e_flags;
	if (new_flags)
		split_rec.e_flags |= new_flags;
	if (clear_flags)
		split_rec.e_flags &= ~clear_flags;

	/* Split/merge as needed so exactly [cpos, cpos+len) carries
	 * the new flags. */
	ret = ocfs2_split_extent(handle, et, left_path,
				 index, &split_rec, meta_ac,
				 dealloc);
	if (ret)
		mlog_errno(ret);

out:
	ocfs2_free_path(left_path);
	return ret;

}
5167
5168 /*
5169  * Mark the already-existing extent at cpos as written for len clusters.
5170  * This removes the unwritten extent flag.
5171  *
5172  * If the existing extent is larger than the request, initiate a
5173  * split. An attempt will be made at merging with adjacent extents.
5174  *
5175  * The caller is responsible for passing down meta_ac if we'll need it.
5176  */
5177 int ocfs2_mark_extent_written(struct inode *inode,
5178                               struct ocfs2_extent_tree *et,
5179                               handle_t *handle, u32 cpos, u32 len, u32 phys,
5180                               struct ocfs2_alloc_context *meta_ac,
5181                               struct ocfs2_cached_dealloc_ctxt *dealloc)
5182 {
5183         int ret;
5184
5185         mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
5186              inode->i_ino, cpos, len, phys);
5187
5188         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5189                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5190                             "that are being written to, but the feature bit "
5191                             "is not set in the super block.",
5192                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
5193                 ret = -EROFS;
5194                 goto out;
5195         }
5196
5197         /*
5198          * XXX: This should be fixed up so that we just re-insert the
5199          * next extent records.
5200          */
5201         ocfs2_et_extent_map_truncate(et, 0);
5202
5203         ret = ocfs2_change_extent_flag(handle, et, cpos,
5204                                        len, phys, meta_ac, dealloc,
5205                                        0, OCFS2_EXT_UNWRITTEN);
5206         if (ret)
5207                 mlog_errno(ret);
5208
5209 out:
5210         return ret;
5211 }
5212
5213 static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5214                             struct ocfs2_path *path,
5215                             int index, u32 new_range,
5216                             struct ocfs2_alloc_context *meta_ac)
5217 {
5218         int ret, depth, credits;
5219         struct buffer_head *last_eb_bh = NULL;
5220         struct ocfs2_extent_block *eb;
5221         struct ocfs2_extent_list *rightmost_el, *el;
5222         struct ocfs2_extent_rec split_rec;
5223         struct ocfs2_extent_rec *rec;
5224         struct ocfs2_insert_type insert;
5225
5226         /*
5227          * Setup the record to split before we grow the tree.
5228          */
5229         el = path_leaf_el(path);
5230         rec = &el->l_recs[index];
5231         ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5232                                    &split_rec, new_range, rec);
5233
5234         depth = path->p_tree_depth;
5235         if (depth > 0) {
5236                 ret = ocfs2_read_extent_block(et->et_ci,
5237                                               ocfs2_et_get_last_eb_blk(et),
5238                                               &last_eb_bh);
5239                 if (ret < 0) {
5240                         mlog_errno(ret);
5241                         goto out;
5242                 }
5243
5244                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5245                 rightmost_el = &eb->h_list;
5246         } else
5247                 rightmost_el = path_leaf_el(path);
5248
5249         credits = path->p_tree_depth +
5250                   ocfs2_extend_meta_needed(et->et_root_el);
5251         ret = ocfs2_extend_trans(handle, credits);
5252         if (ret) {
5253                 mlog_errno(ret);
5254                 goto out;
5255         }
5256
5257         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5258             le16_to_cpu(rightmost_el->l_count)) {
5259                 ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5260                                       meta_ac);
5261                 if (ret) {
5262                         mlog_errno(ret);
5263                         goto out;
5264                 }
5265         }
5266
5267         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5268         insert.ins_appending = APPEND_NONE;
5269         insert.ins_contig = CONTIG_NONE;
5270         insert.ins_split = SPLIT_RIGHT;
5271         insert.ins_tree_depth = depth;
5272
5273         ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5274         if (ret)
5275                 mlog_errno(ret);
5276
5277 out:
5278         brelse(last_eb_bh);
5279         return ret;
5280 }
5281
/*
 * Remove the range [cpos, cpos + len) from the extent record at
 * index in path's leaf.  Partial truncates shrink the record from
 * its left or right edge; removing the whole record empties it and
 * lets the rotate code clean up.  The range must touch one of the
 * record's edges - a mid-record punch is a caller bug (see the
 * BUG() below).
 */
static int ocfs2_truncate_rec(handle_t *handle,
			      struct ocfs2_extent_tree *et,
			      struct ocfs2_path *path, int index,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      u32 cpos, u32 len)
{
	int ret;
	u32 left_cpos, rec_range, trunc_range;
	/* NOTE(review): wants_rotate is assigned below but never read
	 * in this function. */
	int wants_rotate = 0, is_rightmost_tree_rec = 0;
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;

	/* Rotate away an existing empty record in this leaf first so
	 * we never have to deal with two of them. */
	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		index--;
	}

	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
	    path->p_tree_depth) {
		/*
		 * Check whether this is the rightmost tree record. If
		 * we remove all of this record or part of its right
		 * edge then an update of the record lengths above it
		 * will be required.
		 */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
		if (eb->h_next_leaf_blk == 0)
			is_rightmost_tree_rec = 1;
	}

	rec = &el->l_recs[index];
	if (index == 0 && path->p_tree_depth &&
	    le32_to_cpu(rec->e_cpos) == cpos) {
		/*
		 * Changing the leftmost offset (via partial or whole
		 * record truncate) of an interior (or rightmost) path
		 * means we have to update the subtree that is formed
		 * by this leaf and the one to its left.
		 *
		 * There are two cases we can skip:
		 *   1) Path is the leftmost one in our btree.
		 *   2) The leaf is rightmost and will be empty after
		 *      we remove the extent record - the rotate code
		 *      knows how to update the newly formed edge.
		 */

		ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
			left_path = ocfs2_new_path_from_path(path);
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(et->et_ci, left_path,
					      left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	}

	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Get journal access on both paths before modifying anything.
	 * NOTE(review): left_path may be NULL here - presumably
	 * ocfs2_journal_access_path() tolerates that; confirm in its
	 * definition. */
	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
		/* The truncate covers the whole record - empty it. */
		int next_free;

		memset(rec, 0, sizeof(*rec));
		ocfs2_cleanup_merge(el, index);
		wants_rotate = 1;

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (is_rightmost_tree_rec && next_free > 1) {
			/*
			 * We skip the edge update if this path will
			 * be deleted by the rotate code.
			 */
			rec = &el->l_recs[next_free - 1];
			ocfs2_adjust_rightmost_records(handle, et, path,
						       rec);
		}
	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
		/* Remove leftmost portion of the record. */
		le32_add_cpu(&rec->e_cpos, len);
		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
		le16_add_cpu(&rec->e_leaf_clusters, -len);
	} else if (rec_range == trunc_range) {
		/* Remove rightmost portion of the record */
		le16_add_cpu(&rec->e_leaf_clusters, -len);
		if (is_rightmost_tree_rec)
			ocfs2_adjust_rightmost_records(handle, et, path, rec);
	} else {
		/* Caller should have trapped this. */
		mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
		     "(%u, %u)\n",
		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
		     le32_to_cpu(rec->e_cpos),
		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
		BUG();
	}

	if (left_path) {
		int subtree_index;

		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
		ocfs2_complete_edge_insert(handle, left_path, path,
					   subtree_index);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(path));

	ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out:
	ocfs2_free_path(left_path);
	return ret;
}
5439
5440 int ocfs2_remove_extent(handle_t *handle,
5441                         struct ocfs2_extent_tree *et,
5442                         u32 cpos, u32 len,
5443                         struct ocfs2_alloc_context *meta_ac,
5444                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5445 {
5446         int ret, index;
5447         u32 rec_range, trunc_range;
5448         struct ocfs2_extent_rec *rec;
5449         struct ocfs2_extent_list *el;
5450         struct ocfs2_path *path = NULL;
5451
5452         /*
5453          * XXX: Why are we truncating to 0 instead of wherever this
5454          * affects us?
5455          */
5456         ocfs2_et_extent_map_truncate(et, 0);
5457
5458         path = ocfs2_new_path_from_et(et);
5459         if (!path) {
5460                 ret = -ENOMEM;
5461                 mlog_errno(ret);
5462                 goto out;
5463         }
5464
5465         ret = ocfs2_find_path(et->et_ci, path, cpos);
5466         if (ret) {
5467                 mlog_errno(ret);
5468                 goto out;
5469         }
5470
5471         el = path_leaf_el(path);
5472         index = ocfs2_search_extent_list(el, cpos);
5473         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5474                 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5475                             "Owner %llu has an extent at cpos %u which can no "
5476                             "longer be found.\n",
5477                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5478                             cpos);
5479                 ret = -EROFS;
5480                 goto out;
5481         }
5482
5483         /*
5484          * We have 3 cases of extent removal:
5485          *   1) Range covers the entire extent rec
5486          *   2) Range begins or ends on one edge of the extent rec
5487          *   3) Range is in the middle of the extent rec (no shared edges)
5488          *
5489          * For case 1 we remove the extent rec and left rotate to
5490          * fill the hole.
5491          *
5492          * For case 2 we just shrink the existing extent rec, with a
5493          * tree update if the shrinking edge is also the edge of an
5494          * extent block.
5495          *
5496          * For case 3 we do a right split to turn the extent rec into
5497          * something case 2 can handle.
5498          */
5499         rec = &el->l_recs[index];
5500         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5501         trunc_range = cpos + len;
5502
5503         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5504
5505         mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
5506              "(cpos %u, len %u)\n",
5507              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5508              cpos, len, index,
5509              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5510
5511         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5512                 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5513                                          cpos, len);
5514                 if (ret) {
5515                         mlog_errno(ret);
5516                         goto out;
5517                 }
5518         } else {
5519                 ret = ocfs2_split_tree(handle, et, path, index,
5520                                        trunc_range, meta_ac);
5521                 if (ret) {
5522                         mlog_errno(ret);
5523                         goto out;
5524                 }
5525
5526                 /*
5527                  * The split could have manipulated the tree enough to
5528                  * move the record location, so we have to look for it again.
5529                  */
5530                 ocfs2_reinit_path(path, 1);
5531
5532                 ret = ocfs2_find_path(et->et_ci, path, cpos);
5533                 if (ret) {
5534                         mlog_errno(ret);
5535                         goto out;
5536                 }
5537
5538                 el = path_leaf_el(path);
5539                 index = ocfs2_search_extent_list(el, cpos);
5540                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5541                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5542                                     "Owner %llu: split at cpos %u lost record.",
5543                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5544                                     cpos);
5545                         ret = -EROFS;
5546                         goto out;
5547                 }
5548
5549                 /*
5550                  * Double check our values here. If anything is fishy,
5551                  * it's easier to catch it at the top level.
5552                  */
5553                 rec = &el->l_recs[index];
5554                 rec_range = le32_to_cpu(rec->e_cpos) +
5555                         ocfs2_rec_clusters(el, rec);
5556                 if (rec_range != trunc_range) {
5557                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5558                                     "Owner %llu: error after split at cpos %u"
5559                                     "trunc len %u, existing record is (%u,%u)",
5560                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5561                                     cpos, len, le32_to_cpu(rec->e_cpos),
5562                                     ocfs2_rec_clusters(el, rec));
5563                         ret = -EROFS;
5564                         goto out;
5565                 }
5566
5567                 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5568                                          cpos, len);
5569                 if (ret) {
5570                         mlog_errno(ret);
5571                         goto out;
5572                 }
5573         }
5574
5575 out:
5576         ocfs2_free_path(path);
5577         return ret;
5578 }
5579
5580 /*
5581  * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
5582  * same as ocfs2_lock_alloctors(), except for it accepts a blocks
5583  * number to reserve some extra blocks, and it only handles meta
5584  * data allocations.
5585  *
5586  * Currently, only ocfs2_remove_btree_range() uses it for truncating
5587  * and punching holes.
5588  */
5589 static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5590                                               struct ocfs2_extent_tree *et,
5591                                               u32 extents_to_split,
5592                                               struct ocfs2_alloc_context **ac,
5593                                               int extra_blocks)
5594 {
5595         int ret = 0, num_free_extents;
5596         unsigned int max_recs_needed = 2 * extents_to_split;
5597         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5598
5599         *ac = NULL;
5600
5601         num_free_extents = ocfs2_num_free_extents(osb, et);
5602         if (num_free_extents < 0) {
5603                 ret = num_free_extents;
5604                 mlog_errno(ret);
5605                 goto out;
5606         }
5607
5608         if (!num_free_extents ||
5609             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5610                 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5611
5612         if (extra_blocks) {
5613                 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5614                 if (ret < 0) {
5615                         if (ret != -ENOSPC)
5616                                 mlog_errno(ret);
5617                         goto out;
5618                 }
5619         }
5620
5621 out:
5622         if (ret) {
5623                 if (*ac) {
5624                         ocfs2_free_alloc_context(*ac);
5625                         *ac = NULL;
5626                 }
5627         }
5628
5629         return ret;
5630 }
5631
5632 int ocfs2_remove_btree_range(struct inode *inode,
5633                              struct ocfs2_extent_tree *et,
5634                              u32 cpos, u32 phys_cpos, u32 len, int flags,
5635                              struct ocfs2_cached_dealloc_ctxt *dealloc,
5636                              u64 refcount_loc)
5637 {
5638         int ret, credits = 0, extra_blocks = 0;
5639         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5640         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5641         struct inode *tl_inode = osb->osb_tl_inode;
5642         handle_t *handle;
5643         struct ocfs2_alloc_context *meta_ac = NULL;
5644         struct ocfs2_refcount_tree *ref_tree = NULL;
5645
5646         if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5647                 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5648                          OCFS2_HAS_REFCOUNT_FL));
5649
5650                 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5651                                                &ref_tree, NULL);
5652                 if (ret) {
5653                         mlog_errno(ret);
5654                         goto out;
5655                 }
5656
5657                 ret = ocfs2_prepare_refcount_change_for_del(inode,
5658                                                             refcount_loc,
5659                                                             phys_blkno,
5660                                                             len,
5661                                                             &credits,
5662                                                             &extra_blocks);
5663                 if (ret < 0) {
5664                         mlog_errno(ret);
5665                         goto out;
5666                 }
5667         }
5668
5669         ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5670                                                  extra_blocks);
5671         if (ret) {
5672                 mlog_errno(ret);
5673                 return ret;
5674         }
5675
5676         mutex_lock(&tl_inode->i_mutex);
5677
5678         if (ocfs2_truncate_log_needs_flush(osb)) {
5679                 ret = __ocfs2_flush_truncate_log(osb);
5680                 if (ret < 0) {
5681                         mlog_errno(ret);
5682                         goto out;
5683                 }
5684         }
5685
5686         handle = ocfs2_start_trans(osb,
5687                         ocfs2_remove_extent_credits(osb->sb) + credits);
5688         if (IS_ERR(handle)) {
5689                 ret = PTR_ERR(handle);
5690                 mlog_errno(ret);
5691                 goto out;
5692         }
5693
5694         ret = ocfs2_et_root_journal_access(handle, et,
5695                                            OCFS2_JOURNAL_ACCESS_WRITE);
5696         if (ret) {
5697                 mlog_errno(ret);
5698                 goto out;
5699         }
5700
5701         dquot_free_space_nodirty(inode,
5702                                   ocfs2_clusters_to_bytes(inode->i_sb, len));
5703
5704         ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5705         if (ret) {
5706                 mlog_errno(ret);
5707                 goto out_commit;
5708         }
5709
5710         ocfs2_et_update_clusters(et, -len);
5711
5712         ocfs2_journal_dirty(handle, et->et_root_bh);
5713
5714         if (phys_blkno) {
5715                 if (flags & OCFS2_EXT_REFCOUNTED)
5716                         ret = ocfs2_decrease_refcount(inode, handle,
5717                                         ocfs2_blocks_to_clusters(osb->sb,
5718                                                                  phys_blkno),
5719                                         len, meta_ac,
5720                                         dealloc, 1);
5721                 else
5722                         ret = ocfs2_truncate_log_append(osb, handle,
5723                                                         phys_blkno, len);
5724                 if (ret)
5725                         mlog_errno(ret);
5726
5727         }
5728
5729 out_commit:
5730         ocfs2_commit_trans(osb, handle);
5731 out:
5732         mutex_unlock(&tl_inode->i_mutex);
5733
5734         if (meta_ac)
5735                 ocfs2_free_alloc_context(meta_ac);
5736
5737         if (ref_tree)
5738                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5739
5740         return ret;
5741 }
5742
5743 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5744 {
5745         struct buffer_head *tl_bh = osb->osb_tl_bh;
5746         struct ocfs2_dinode *di;
5747         struct ocfs2_truncate_log *tl;
5748
5749         di = (struct ocfs2_dinode *) tl_bh->b_data;
5750         tl = &di->id2.i_dealloc;
5751
5752         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5753                         "slot %d, invalid truncate log parameters: used = "
5754                         "%u, count = %u\n", osb->slot_num,
5755                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5756         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5757 }
5758
5759 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5760                                            unsigned int new_start)
5761 {
5762         unsigned int tail_index;
5763         unsigned int current_tail;
5764
5765         /* No records, nothing to coalesce */
5766         if (!le16_to_cpu(tl->tl_used))
5767                 return 0;
5768
5769         tail_index = le16_to_cpu(tl->tl_used) - 1;
5770         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5771         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5772
5773         return current_tail == new_start;
5774 }
5775
/*
 * Append a record covering num_clusters clusters starting at start_blk
 * to this node's truncate log, coalescing with the tail record when
 * the new range immediately follows it.
 *
 * Caller must hold tl_inode->i_mutex and must have flushed a full log
 * first - a full log here returns -ENOSPC.  The update is journaled
 * in the caller's handle.
 */
int ocfs2_truncate_log_append(struct ocfs2_super *osb,
			      handle_t *handle,
			      u64 start_blk,
			      unsigned int num_clusters)
{
	int status, index;
	unsigned int start_cluster, tl_count;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog(0, "start_blk = %llu, num_clusters = %u\n",
	     (unsigned long long)start_blk, num_clusters);

	/* trylock must fail here - the caller already holds the mutex. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);

	di = (struct ocfs2_dinode *) tl_bh->b_data;

	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
	 * by the underlying call to ocfs2_read_inode_block(), so any
	 * corruption is a code bug */
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	tl = &di->id2.i_dealloc;
	tl_count = le16_to_cpu(tl->tl_count);
	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
			tl_count == 0,
			"Truncate record count on #%llu invalid "
			"wanted %u, actual %u\n",
			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
			ocfs2_truncate_recs_per_inode(osb->sb),
			le16_to_cpu(tl->tl_count));

	/* Caller should have known to flush before calling us. */
	index = le16_to_cpu(tl->tl_used);
	if (index >= tl_count) {
		status = -ENOSPC;
		mlog_errno(status);
		goto bail;
	}

	/* Get journal write access before modifying the log block. */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
	     "%llu (index = %d)\n", num_clusters, start_cluster,
	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);

	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
		/*
		 * Move index back to the record we are coalescing with.
		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
		 */
		index--;

		/* Extend the tail record; t_start stays as-is. */
		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
		     index, le32_to_cpu(tl->tl_recs[index].t_start),
		     num_clusters);
	} else {
		/* Start a fresh record and bump the used count. */
		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
		tl->tl_used = cpu_to_le16(index + 1);
	}
	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);

	ocfs2_journal_dirty(handle, tl_bh);

	osb->truncated_clusters += num_clusters;
bail:
	mlog_exit(status);
	return status;
}
5855
/*
 * Free every cluster range recorded in this node's truncate log,
 * walking the records from tail to head.  For each record, tl_used is
 * journaled and decremented *before* the clusters are freed, so a
 * crash mid-replay can never free the same range twice.
 *
 * data_alloc_inode/data_alloc_bh are the global bitmap inode and its
 * locked dinode block; the caller holds the relevant locks and an open
 * transaction with at least enough credits for one log-dinode update.
 */
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
					 handle_t *handle,
					 struct inode *data_alloc_inode,
					 struct buffer_head *data_alloc_bh)
{
	int status = 0;
	int i;
	unsigned int num_clusters;
	u64 start_blk;
	struct ocfs2_truncate_rec rec;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	i = le16_to_cpu(tl->tl_used) - 1;
	while (i >= 0) {
		/* Caller has given us at least enough credits to
		 * update the truncate log dinode */
		status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
						 OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* Drop the record from the log before freeing its clusters. */
		tl->tl_used = cpu_to_le16(i);

		ocfs2_journal_dirty(handle, tl_bh);

		/* TODO: Perhaps we can calculate the bulk of the
		 * credits up front rather than extending like
		 * this. */
		status = ocfs2_extend_trans(handle,
					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* Copy the record - the log block may be re-journaled above. */
		rec = tl->tl_recs[i];
		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
						    le32_to_cpu(rec.t_start));
		num_clusters = le32_to_cpu(rec.t_clusters);

		/* if start_blk is not set, we ignore the record as
		 * invalid. */
		if (start_blk) {
			mlog(0, "free record %d, start = %u, clusters = %u\n",
			     i, le32_to_cpu(rec.t_start), num_clusters);

			status = ocfs2_free_clusters(handle, data_alloc_inode,
						     data_alloc_bh, start_blk,
						     num_clusters);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}
		i--;
	}

	osb->truncated_clusters = 0;

bail:
	mlog_exit(status);
	return status;
}
5926
/*
 * Flush this node's truncate log: take the global bitmap inode's
 * mutex and cluster lock, then replay (free) every logged record in
 * a single transaction via ocfs2_replay_truncate_records().
 *
 * Expects you to already be holding tl_inode->i_mutex.  An empty log
 * is a successful no-op.
 */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	unsigned int num_to_flush;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *data_alloc_inode = NULL;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct buffer_head *data_alloc_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	/* trylock must fail here - the caller already holds the mutex. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	di = (struct ocfs2_dinode *) tl_bh->b_data;

	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
	 * by the underlying call to ocfs2_read_inode_block(), so any
	 * corruption is a code bug */
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	tl = &di->id2.i_dealloc;
	num_to_flush = le16_to_cpu(tl->tl_used);
	mlog(0, "Flush %u records from truncate log #%llu\n",
	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
	if (!num_to_flush) {
		status = 0;
		goto out;
	}

	/* The freed clusters go back to the global bitmap. */
	data_alloc_inode = ocfs2_get_system_file_inode(osb,
						       GLOBAL_BITMAP_SYSTEM_INODE,
						       OCFS2_INVALID_SLOT);
	if (!data_alloc_inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not get bitmap inode!\n");
		goto out;
	}

	/* Lock order: i_mutex, then the cluster lock, then transaction. */
	mutex_lock(&data_alloc_inode->i_mutex);

	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_unlock;
	}

	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
					       data_alloc_bh);
	if (status < 0)
		mlog_errno(status);

	ocfs2_commit_trans(osb, handle);

out_unlock:
	brelse(data_alloc_bh);
	ocfs2_inode_unlock(data_alloc_inode, 1);

out_mutex:
	mutex_unlock(&data_alloc_inode->i_mutex);
	iput(data_alloc_inode);

out:
	mlog_exit(status);
	return status;
}
6001
6002 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6003 {
6004         int status;
6005         struct inode *tl_inode = osb->osb_tl_inode;
6006
6007         mutex_lock(&tl_inode->i_mutex);
6008         status = __ocfs2_flush_truncate_log(osb);
6009         mutex_unlock(&tl_inode->i_mutex);
6010
6011         return status;
6012 }
6013
6014 static void ocfs2_truncate_log_worker(struct work_struct *work)
6015 {
6016         int status;
6017         struct ocfs2_super *osb =
6018                 container_of(work, struct ocfs2_super,
6019                              osb_truncate_log_wq.work);
6020
6021         status = ocfs2_flush_truncate_log(osb);
6022         if (status < 0)
6023                 mlog_errno(status);
6024         else
6025                 ocfs2_init_steal_slots(osb);
6026
6027         mlog_exit(status);
6028 }
6029
6030 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
6031 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
6032                                        int cancel)
6033 {
6034         if (osb->osb_tl_inode) {
6035                 /* We want to push off log flushes while truncates are
6036                  * still running. */
6037                 if (cancel)
6038                         cancel_delayed_work(&osb->osb_truncate_log_wq);
6039
6040                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
6041                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
6042         }
6043 }
6044
6045 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
6046                                        int slot_num,
6047                                        struct inode **tl_inode,
6048                                        struct buffer_head **tl_bh)
6049 {
6050         int status;
6051         struct inode *inode = NULL;
6052         struct buffer_head *bh = NULL;
6053
6054         inode = ocfs2_get_system_file_inode(osb,
6055                                            TRUNCATE_LOG_SYSTEM_INODE,
6056                                            slot_num);
6057         if (!inode) {
6058                 status = -EINVAL;
6059                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
6060                 goto bail;
6061         }
6062
6063         status = ocfs2_read_inode_block(inode, &bh);
6064         if (status < 0) {
6065                 iput(inode);
6066                 mlog_errno(status);
6067                 goto bail;
6068         }
6069
6070         *tl_inode = inode;
6071         *tl_bh    = bh;
6072 bail:
6073         mlog_exit(status);
6074         return status;
6075 }
6076
/* called during the 1st stage of node recovery. we stamp a clean
 * truncate log and pass back a copy for processing later. if the
 * truncate log does not require processing, a *tl_copy is set to
 * NULL.
 *
 * The copy is kmalloc'd; ownership passes to the caller, who frees it
 * after ocfs2_complete_truncate_log_recovery().  The on-disk log is
 * zeroed (tl_used = 0) and written back *before* we return the copy,
 * so the recovering node never replays these records twice. */
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
				      int slot_num,
				      struct ocfs2_dinode **tl_copy)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	*tl_copy = NULL;

	mlog(0, "recover truncate log from slot %d\n", slot_num);

	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	di = (struct ocfs2_dinode *) tl_bh->b_data;

	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
	 * validated by the underlying call to ocfs2_read_inode_block(),
	 * so any corruption is a code bug */
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	tl = &di->id2.i_dealloc;
	if (le16_to_cpu(tl->tl_used)) {
		mlog(0, "We'll have %u logs to recover\n",
		     le16_to_cpu(tl->tl_used));

		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
		if (!(*tl_copy)) {
			status = -ENOMEM;
			mlog_errno(status);
			goto bail;
		}

		/* Assuming the write-out below goes well, this copy
		 * will be passed back to recovery for processing. */
		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);

		/* All we need to do to clear the truncate log is set
		 * tl_used. */
		tl->tl_used = 0;

		/* Recompute the block's ecc before the raw write below. */
		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
		status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	if (tl_inode)
		iput(tl_inode);
	brelse(tl_bh);

	/* Never hand back a copy on failure. */
	if (status < 0 && (*tl_copy)) {
		kfree(*tl_copy);
		*tl_copy = NULL;
	}

	mlog_exit(status);
	return status;
}
6149
/*
 * Second stage of truncate log recovery: re-log every record from the
 * dead node's copied truncate log (tl_copy, produced by
 * ocfs2_begin_truncate_log_recovery()) into *this* node's own log,
 * flushing our log whenever it fills up.  Each append runs in its own
 * transaction.  The caller still owns and frees tl_copy.
 */
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy)
{
	int status = 0;
	int i;
	unsigned int clusters, num_recs, start_cluster;
	u64 start_blk;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_truncate_log *tl;

	/* Recovering our own log this way would self-deadlock/corrupt. */
	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
		return -EINVAL;
	}

	tl = &tl_copy->id2.i_dealloc;
	num_recs = le16_to_cpu(tl->tl_used);
	mlog(0, "cleanup %u records from %llu\n", num_recs,
	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));

	mutex_lock(&tl_inode->i_mutex);
	for(i = 0; i < num_recs; i++) {
		/* Make room in our own log before appending to it. */
		if (ocfs2_truncate_log_needs_flush(osb)) {
			status = __ocfs2_flush_truncate_log(osb);
			if (status < 0) {
				mlog_errno(status);
				goto bail_up;
			}
		}

		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_up;
		}

		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);

		status = ocfs2_truncate_log_append(osb, handle,
						   start_blk, clusters);
		ocfs2_commit_trans(osb, handle);
		if (status < 0) {
			mlog_errno(status);
			goto bail_up;
		}
	}

bail_up:
	mutex_unlock(&tl_inode->i_mutex);

	mlog_exit(status);
	return status;
}
6207
6208 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6209 {
6210         int status;
6211         struct inode *tl_inode = osb->osb_tl_inode;
6212
6213         if (tl_inode) {
6214                 cancel_delayed_work(&osb->osb_truncate_log_wq);
6215                 flush_workqueue(ocfs2_wq);
6216
6217                 status = ocfs2_flush_truncate_log(osb);
6218                 if (status < 0)
6219                         mlog_errno(status);
6220
6221                 brelse(osb->osb_tl_bh);
6222                 iput(osb->osb_tl_inode);
6223         }
6224
6225         mlog_exit_void();
6226 }
6227
/*
 * Look up this node's truncate log inode/buffer and wire it into the
 * superblock, along with the delayed flush work item.  Returns 0 or a
 * negative errno from ocfs2_get_truncate_log_info().
 */
int ocfs2_truncate_log_init(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;

	status = ocfs2_get_truncate_log_info(osb,
					     osb->slot_num,
					     &tl_inode,
					     &tl_bh);
	if (status < 0)
		mlog_errno(status);

	/* ocfs2_truncate_log_shutdown keys on the existence of
	 * osb->osb_tl_inode so we don't set any of the osb variables
	 * until we're sure all is well. */
	/* NOTE(review): on failure the stores below still run, but
	 * tl_inode/tl_bh presumably remain NULL so osb_tl_inode stays
	 * unset — confirm against ocfs2_get_truncate_log_info(). */
	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
			  ocfs2_truncate_log_worker);
	osb->osb_tl_bh    = tl_bh;
	osb->osb_tl_inode = tl_inode;

	mlog_exit(status);
	return status;
}
6252
6253 /*
6254  * Delayed de-allocation of suballocator blocks.
6255  *
6256  * Some sets of block de-allocations might involve multiple suballocator inodes.
6257  *
6258  * The locking for this can get extremely complicated, especially when
6259  * the suballocator inodes to delete from aren't known until deep
6260  * within an unrelated codepath.
6261  *
6262  * ocfs2_extent_block structures are a good example of this - an inode
6263  * btree could have been grown by any number of nodes each allocating
6264  * out of their own suballoc inode.
6265  *
6266  * These structures allow the delay of block de-allocation until a
6267  * later time, when locking of multiple cluster inodes won't cause
6268  * deadlock.
6269  */
6270
6271 /*
6272  * Describe a single bit freed from a suballocator.  For the block
6273  * suballocators, it represents one block.  For the global cluster
6274  * allocator, it represents some clusters and free_bit indicates
6275  * clusters number.
6276  */
struct ocfs2_cached_block_free {
	/* Next entry on the per-slot or global free list. */
	struct ocfs2_cached_block_free          *free_next;
	/* Suballocator group block number; when 0,
	 * ocfs2_free_cached_blocks() derives the group from free_blk. */
	u64                                     free_bg;
	/* Block number of the freed bit (start block for the global
	 * cluster allocator case). */
	u64                                     free_blk;
	/* Bit offset within the group; the global cluster allocator
	 * path instead stores the cluster count here. */
	unsigned int                            free_bit;
};
6283
/*
 * One list of cached frees per (system file type, slot) pair.  The
 * lists themselves are chained off ocfs2_cached_dealloc_ctxt.
 */
struct ocfs2_per_slot_free_list {
	struct ocfs2_per_slot_free_list         *f_next_suballocator;
	int                                     f_inode_type;
	int                                     f_slot;
	/* Head of the ocfs2_cached_block_free chain for this slot. */
	struct ocfs2_cached_block_free          *f_first;
};
6290
/*
 * Free every cached suballocator bit on @head against the suballocator
 * system file identified by (@sysfile_type, @slot).  Takes the system
 * file's i_mutex and cluster lock, then frees one bit per loop
 * iteration, extending the transaction as it goes.  Consumes the list:
 * every node is kfree()d, even on early error (see the loop after
 * "out:").  Returns 0 or a negative errno from the first failure.
 */
static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
                                    int sysfile_type,
                                    int slot,
                                    struct ocfs2_cached_block_free *head)
{
	int ret;
	u64 bg_blkno;
	handle_t *handle;
	struct inode *inode;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_cached_block_free *tmp;

	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto out;
	}

	mutex_lock(&inode->i_mutex);

	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	while (head) {
		/* A zero free_bg means the group wasn't recorded at
		 * cache time; compute it from the block number. */
		if (head->free_bg)
			bg_blkno = head->free_bg;
		else
			bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
							      head->free_bit);
		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
		     head->free_bit, (unsigned long long)head->free_blk);

		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
					       head->free_bit, bg_blkno, 1);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		/* Keep enough credits in the handle for the next free. */
		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

out_journal:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	ocfs2_inode_unlock(inode, 1);
	brelse(di_bh);
out_mutex:
	mutex_unlock(&inode->i_mutex);
	iput(inode);
out:
	while(head) {
		/* Premature exit may have left some dangling items. */
		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

	return ret;
}
6371
6372 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6373                                 u64 blkno, unsigned int bit)
6374 {
6375         int ret = 0;
6376         struct ocfs2_cached_block_free *item;
6377
6378         item = kzalloc(sizeof(*item), GFP_NOFS);
6379         if (item == NULL) {
6380                 ret = -ENOMEM;
6381                 mlog_errno(ret);
6382                 return ret;
6383         }
6384
6385         mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6386              bit, (unsigned long long)blkno);
6387
6388         item->free_blk = blkno;
6389         item->free_bit = bit;
6390         item->free_next = ctxt->c_global_allocator;
6391
6392         ctxt->c_global_allocator = item;
6393         return ret;
6394 }
6395
/*
 * Hand every cached cluster run on @head to the truncate log, flushing
 * the log whenever it fills up.  Note free_bit carries the cluster
 * count for these entries (see ocfs2_cache_cluster_dealloc()).
 * Consumes the list — each node is kfree()d, including any left over
 * after an early break.  Returns 0 or the first negative errno.
 */
static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
                                      struct ocfs2_cached_block_free *head)
{
	struct ocfs2_cached_block_free *tmp;
	struct inode *tl_inode = osb->osb_tl_inode;
	handle_t *handle;
	int ret = 0;

	mutex_lock(&tl_inode->i_mutex);

	while (head) {
		if (ocfs2_truncate_log_needs_flush(osb)) {
			ret = __ocfs2_flush_truncate_log(osb);
			if (ret < 0) {
				mlog_errno(ret);
				break;
			}
		}

		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			mlog_errno(ret);
			break;
		}

		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
						head->free_bit);

		/* Free the node before checking ret: the append either
		 * happened or it didn't, so the entry is done with. */
		ocfs2_commit_trans(osb, handle);
		tmp = head;
		head = head->free_next;
		kfree(tmp);

		if (ret < 0) {
			mlog_errno(ret);
			break;
		}
	}

	mutex_unlock(&tl_inode->i_mutex);

	while (head) {
		/* Premature exit may have left some dangling items. */
		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

	return ret;
}
6447
6448 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6449                        struct ocfs2_cached_dealloc_ctxt *ctxt)
6450 {
6451         int ret = 0, ret2;
6452         struct ocfs2_per_slot_free_list *fl;
6453
6454         if (!ctxt)
6455                 return 0;
6456
6457         while (ctxt->c_first_suballocator) {
6458                 fl = ctxt->c_first_suballocator;
6459
6460                 if (fl->f_first) {
6461                         mlog(0, "Free items: (type %u, slot %d)\n",
6462                              fl->f_inode_type, fl->f_slot);
6463                         ret2 = ocfs2_free_cached_blocks(osb,
6464                                                         fl->f_inode_type,
6465                                                         fl->f_slot,
6466                                                         fl->f_first);
6467                         if (ret2)
6468                                 mlog_errno(ret2);
6469                         if (!ret)
6470                                 ret = ret2;
6471                 }
6472
6473                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6474                 kfree(fl);
6475         }
6476
6477         if (ctxt->c_global_allocator) {
6478                 ret2 = ocfs2_free_cached_clusters(osb,
6479                                                   ctxt->c_global_allocator);
6480                 if (ret2)
6481                         mlog_errno(ret2);
6482                 if (!ret)
6483                         ret = ret2;
6484
6485                 ctxt->c_global_allocator = NULL;
6486         }
6487
6488         return ret;
6489 }
6490
6491 static struct ocfs2_per_slot_free_list *
6492 ocfs2_find_per_slot_free_list(int type,
6493                               int slot,
6494                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6495 {
6496         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6497
6498         while (fl) {
6499                 if (fl->f_inode_type == type && fl->f_slot == slot)
6500                         return fl;
6501
6502                 fl = fl->f_next_suballocator;
6503         }
6504
6505         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6506         if (fl) {
6507                 fl->f_inode_type = type;
6508                 fl->f_slot = slot;
6509                 fl->f_first = NULL;
6510                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6511
6512                 ctxt->c_first_suballocator = fl;
6513         }
6514         return fl;
6515 }
6516
6517 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6518                               int type, int slot, u64 suballoc,
6519                               u64 blkno, unsigned int bit)
6520 {
6521         int ret;
6522         struct ocfs2_per_slot_free_list *fl;
6523         struct ocfs2_cached_block_free *item;
6524
6525         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6526         if (fl == NULL) {
6527                 ret = -ENOMEM;
6528                 mlog_errno(ret);
6529                 goto out;
6530         }
6531
6532         item = kzalloc(sizeof(*item), GFP_NOFS);
6533         if (item == NULL) {
6534                 ret = -ENOMEM;
6535                 mlog_errno(ret);
6536                 goto out;
6537         }
6538
6539         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6540              type, slot, bit, (unsigned long long)blkno);
6541
6542         item->free_bg = suballoc;
6543         item->free_blk = blkno;
6544         item->free_bit = bit;
6545         item->free_next = fl->f_first;
6546
6547         fl->f_first = item;
6548
6549         ret = 0;
6550 out:
6551         return ret;
6552 }
6553
6554 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6555                                          struct ocfs2_extent_block *eb)
6556 {
6557         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6558                                          le16_to_cpu(eb->h_suballoc_slot),
6559                                          le64_to_cpu(eb->h_suballoc_loc),
6560                                          le64_to_cpu(eb->h_blkno),
6561                                          le16_to_cpu(eb->h_suballoc_bit));
6562 }
6563
/* walk_page_buffers() callback: mark a buffer uptodate and dirty
 * after its page contents were zeroed.  Always returns 0. */
static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return 0;
}
6570
/*
 * Map the blocks backing [from, to) of @page (starting at *@phys),
 * optionally zero that range, then mark the buffers uptodate and
 * dirty.  Marks the whole page uptodate when every buffer was
 * handled.  Errors are logged but not returned — callers get a
 * best-effort page.
 */
void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
			      unsigned int from, unsigned int to,
			      struct page *page, int zero, u64 *phys)
{
	int ret, partial = 0;

	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
	if (ret)
		mlog_errno(ret);

	if (zero)
		zero_user_segment(page, from, to);

	/*
	 * Need to set the buffers we zero'd into uptodate
	 * here if they aren't - ocfs2_map_page_blocks()
	 * might've skipped some
	 */
	ret = walk_page_buffers(handle, page_buffers(page),
				from, to, &partial,
				ocfs2_zero_func);
	if (ret < 0)
		mlog_errno(ret);
	else if (ocfs2_should_order_data(inode)) {
		/* data=ordered: attach the inode to the transaction so
		 * its data is written before the commit. */
		ret = ocfs2_jbd2_file_inode(handle, inode);
		if (ret < 0)
			mlog_errno(ret);
	}

	/* partial is set when some buffers in [from, to) were skipped,
	 * in which case the page as a whole isn't uptodate. */
	if (!partial)
		SetPageUptodate(page);

	flush_dcache_page(page);
}
6605
6606 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6607                                      loff_t end, struct page **pages,
6608                                      int numpages, u64 phys, handle_t *handle)
6609 {
6610         int i;
6611         struct page *page;
6612         unsigned int from, to = PAGE_CACHE_SIZE;
6613         struct super_block *sb = inode->i_sb;
6614
6615         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6616
6617         if (numpages == 0)
6618                 goto out;
6619
6620         to = PAGE_CACHE_SIZE;
6621         for(i = 0; i < numpages; i++) {
6622                 page = pages[i];
6623
6624                 from = start & (PAGE_CACHE_SIZE - 1);
6625                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6626                         to = end & (PAGE_CACHE_SIZE - 1);
6627
6628                 BUG_ON(from > PAGE_CACHE_SIZE);
6629                 BUG_ON(to > PAGE_CACHE_SIZE);
6630
6631                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6632                                          &phys);
6633
6634                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6635         }
6636 out:
6637         if (pages)
6638                 ocfs2_unlock_and_free_pages(pages, numpages);
6639 }
6640
/*
 * Find-or-create and lock the page cache pages covering [start, end),
 * storing them in @pages and the count in *@num.  Note the do/while:
 * at least one page (the one containing @start) is always grabbed,
 * even when the range rounds to zero pages.  On failure all grabbed
 * pages are unlocked and released and *@num is 0.
 */
int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
		     struct page **pages, int *num)
{
	int numpages, ret = 0;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index;
	loff_t last_page_bytes;

	BUG_ON(start > end);

	numpages = 0;
	last_page_bytes = PAGE_ALIGN(end);
	index = start >> PAGE_CACHE_SHIFT;
	do {
		pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
		if (!pages[numpages]) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		numpages++;
		index++;
	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));

out:
	if (ret != 0) {
		/* Drop whatever we managed to grab before the failure. */
		if (pages)
			ocfs2_unlock_and_free_pages(pages, numpages);
		numpages = 0;
	}

	*num = numpages;

	return ret;
}
6677
/*
 * Grab the pages for a range that must lie entirely within a single
 * cluster (enforced by the BUG_ON).  Thin wrapper around
 * ocfs2_grab_pages().
 */
static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
				struct page **pages, int *num)
{
	struct super_block *sb = inode->i_sb;

	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);

	return ocfs2_grab_pages(inode, start, end, pages, num);
}
6688
6689 /*
6690  * Zero the area past i_size but still within an allocated
6691  * cluster. This avoids exposing nonzero data on subsequent file
6692  * extends.
6693  *
6694  * We need to call this before i_size is updated on the inode because
6695  * otherwise block_write_full_page() will skip writeout of pages past
6696  * i_size. The new_i_size parameter is passed for this reason.
6697  */
6698 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6699                                   u64 range_start, u64 range_end)
6700 {
6701         int ret = 0, numpages;
6702         struct page **pages = NULL;
6703         u64 phys;
6704         unsigned int ext_flags;
6705         struct super_block *sb = inode->i_sb;
6706
6707         /*
6708          * File systems which don't support sparse files zero on every
6709          * extend.
6710          */
6711         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6712                 return 0;
6713
6714         pages = kcalloc(ocfs2_pages_per_cluster(sb),
6715                         sizeof(struct page *), GFP_NOFS);
6716         if (pages == NULL) {
6717                 ret = -ENOMEM;
6718                 mlog_errno(ret);
6719                 goto out;
6720         }
6721
6722         if (range_start == range_end)
6723                 goto out;
6724
6725         ret = ocfs2_extent_map_get_blocks(inode,
6726                                           range_start >> sb->s_blocksize_bits,
6727                                           &phys, NULL, &ext_flags);
6728         if (ret) {
6729                 mlog_errno(ret);
6730                 goto out;
6731         }
6732
6733         /*
6734          * Tail is a hole, or is marked unwritten. In either case, we
6735          * can count on read and write to return/push zero's.
6736          */
6737         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6738                 goto out;
6739
6740         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6741                                    &numpages);
6742         if (ret) {
6743                 mlog_errno(ret);
6744                 goto out;
6745         }
6746
6747         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6748                                  numpages, phys, handle);
6749
6750         /*
6751          * Initiate writeout of the pages we zero'd here. We don't
6752          * wait on them - the truncate_inode_pages() call later will
6753          * do that for us.
6754          */
6755         ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
6756                                        range_end - 1);
6757         if (ret)
6758                 mlog_errno(ret);
6759
6760 out:
6761         if (pages)
6762                 kfree(pages);
6763
6764         return ret;
6765 }
6766
6767 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6768                                              struct ocfs2_dinode *di)
6769 {
6770         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6771         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6772
6773         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6774                 memset(&di->id2, 0, blocksize -
6775                                     offsetof(struct ocfs2_dinode, id2) -
6776                                     xattrsize);
6777         else
6778                 memset(&di->id2, 0, blocksize -
6779                                     offsetof(struct ocfs2_dinode, id2));
6780 }
6781
/*
 * Re-initialize the dinode's id2 area as an empty extent list sized
 * to the in-inode space remaining after any inline xattrs.
 */
void ocfs2_dinode_new_extent_list(struct inode *inode,
				  struct ocfs2_dinode *di)
{
	ocfs2_zero_dinode_id2_with_xattr(inode, di);
	di->id2.i_list.l_tree_depth = 0;
	di->id2.i_list.l_next_free_rec = 0;
	di->id2.i_list.l_count = cpu_to_le16(
		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
}
6791
/*
 * Switch the inode to inline-data mode: set OCFS2_INLINE_DATA_FL in
 * both the in-memory and on-disk dynamic features (under ip_lock),
 * then reset id2 as an empty inline-data region.
 */
void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_inline_data *idata = &di->id2.i_data;

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	/*
	 * We clear the entire i_data structure here so that all
	 * fields can be properly initialized.
	 */
	ocfs2_zero_dinode_id2_with_xattr(inode, di);

	idata->id_count = cpu_to_le16(
			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
}
6811
/*
 * Convert an inline-data inode to an extent-based one.  If the inode
 * has data, one cluster is reserved and claimed, the inline bytes are
 * copied into its first page, and the cluster is inserted as extent 0.
 * The inline flag is cleared and the dinode's id2 is rebuilt as an
 * extent list — all within a single transaction.  Quota is charged
 * for the new cluster and refunded if the transaction fails.
 */
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
					 struct buffer_head *di_bh)
{
	int ret, i, has_data, num_pages = 0;
	handle_t *handle;
	u64 uninitialized_var(block);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct page **pages = NULL;
	loff_t end = osb->s_clustersize;
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	/* A zero-size inode needs no cluster; we only rewrite the
	 * dinode in that case. */
	has_data = i_size_read(inode) ? 1 : 0;

	if (has_data) {
		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
				sizeof(struct page *), GFP_NOFS);
		if (pages == NULL) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb,
				   ocfs2_inline_to_extents_credits(osb->sb));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (has_data) {
		u32 bit_off, num;
		unsigned int page_end;
		u64 phys;

		ret = dquot_alloc_space_nodirty(inode,
				       ocfs2_clusters_to_bytes(osb->sb, 1));
		if (ret)
			goto out_commit;
		did_quota = 1;

		data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;

		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
					   &num);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * Save two copies, one for insert, and one that can
		 * be changed by ocfs2_map_and_dirty_page() below.
		 */
		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

		/*
		 * Non sparse file systems zero on extend, so no need
		 * to do that now.
		 */
		if (!ocfs2_sparse_alloc(osb) &&
		    PAGE_CACHE_SIZE < osb->s_clustersize)
			end = PAGE_CACHE_SIZE;

		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * This should populate the 1st page for us and mark
		 * it up to date.
		 */
		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		page_end = PAGE_CACHE_SIZE;
		if (PAGE_CACHE_SIZE > osb->s_clustersize)
			page_end = osb->s_clustersize;

		/* Map and dirty every page; only pages after the first
		 * are zeroed (the first carries the inline data). */
		for (i = 0; i < num_pages; i++)
			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
						 pages[i], i > 0, &phys);
	}

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_dinode_new_extent_list(inode, di);

	ocfs2_journal_dirty(handle, di_bh);

	if (has_data) {
		/*
		 * An error at this point should be extremely rare. If
		 * this proves to be false, we could always re-build
		 * the in-inode data from our pages.
		 */
		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		inode->i_blocks = ocfs2_inode_sector_count(inode);
	}

out_commit:
	if (ret < 0 && did_quota)
		dquot_free_space_nodirty(inode,
					  ocfs2_clusters_to_bytes(osb->sb, 1));

	ocfs2_commit_trans(osb, handle);

out_unlock:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);

out:
	if (pages) {
		ocfs2_unlock_and_free_pages(pages, num_pages);
		kfree(pages);
	}

	return ret;
}
6963
6964 /*
6965  * It is expected, that by the time you call this function,
6966  * inode->i_size and fe->i_size have been adjusted.
6967  *
6968  * WARNING: This will kfree the truncate context
6969  */
6970 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6971                           struct inode *inode,
6972                           struct buffer_head *di_bh)
6973 {
6974         int status = 0, i, flags = 0;
6975         u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
6976         u64 blkno = 0;
6977         struct ocfs2_extent_list *el;
6978         struct ocfs2_extent_rec *rec;
6979         struct ocfs2_path *path = NULL;
6980         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6981         struct ocfs2_extent_list *root_el = &(di->id2.i_list);
6982         u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
6983         struct ocfs2_extent_tree et;
6984         struct ocfs2_cached_dealloc_ctxt dealloc;
6985
6986         ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
6987         ocfs2_init_dealloc_ctxt(&dealloc);
6988
6989         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6990                                                      i_size_read(inode));
6991
6992         path = ocfs2_new_path(di_bh, &di->id2.i_list,
6993                               ocfs2_journal_access_di);
6994         if (!path) {
6995                 status = -ENOMEM;
6996                 mlog_errno(status);
6997                 goto bail;
6998         }
6999
7000         ocfs2_extent_map_trunc(inode, new_highest_cpos);
7001
7002 start:
7003         /*
7004          * Check that we still have allocation to delete.
7005          */
7006         if (OCFS2_I(inode)->ip_clusters == 0) {
7007                 status = 0;
7008                 goto bail;
7009         }
7010
7011         /*
7012          * Truncate always works against the rightmost tree branch.
7013          */
7014         status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7015         if (status) {
7016                 mlog_errno(status);
7017                 goto bail;
7018         }
7019
7020         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7021              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7022
7023         /*
7024          * By now, el will point to the extent list on the bottom most
7025          * portion of this tree. Only the tail record is considered in
7026          * each pass.
7027          *
7028          * We handle the following cases, in order:
7029          * - empty extent: delete the remaining branch
7030          * - remove the entire record
7031          * - remove a partial record
7032          * - no record needs to be removed (truncate has completed)
7033          */
7034         el = path_leaf_el(path);
7035         if (le16_to_cpu(el->l_next_free_rec) == 0) {
7036                 ocfs2_error(inode->i_sb,
7037                             "Inode %llu has empty extent block at %llu\n",
7038                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7039                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
7040                 status = -EROFS;
7041                 goto bail;
7042         }
7043
7044         i = le16_to_cpu(el->l_next_free_rec) - 1;
7045         rec = &el->l_recs[i];
7046         flags = rec->e_flags;
7047         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7048
7049         if (i == 0 && ocfs2_is_empty_extent(rec)) {
7050                 /*
7051                  * Lower levels depend on this never happening, but it's best
7052                  * to check it up here before changing the tree.
7053                 */
7054                 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7055                         ocfs2_error(inode->i_sb, "Inode %lu has an empty "
7056                                     "extent record, depth %u\n", inode->i_ino,
7057                                     le16_to_cpu(root_el->l_tree_depth));
7058                         status = -EROFS;
7059                         goto bail;
7060                 }
7061                 trunc_cpos = le32_to_cpu(rec->e_cpos);
7062                 trunc_len = 0;
7063                 blkno = 0;
7064         } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7065                 /*
7066                  * Truncate entire record.
7067                  */
7068                 trunc_cpos = le32_to_cpu(rec->e_cpos);
7069                 trunc_len = ocfs2_rec_clusters(el, rec);
7070                 blkno = le64_to_cpu(rec->e_blkno);
7071         } else if (range > new_highest_cpos) {
7072                 /*
7073                  * Partial truncate. it also should be
7074                  * the last truncate we're doing.
7075                  */
7076                 trunc_cpos = new_highest_cpos;
7077                 trunc_len = range - new_highest_cpos;
7078                 coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7079                 blkno = le64_to_cpu(rec->e_blkno) +
7080                                 ocfs2_clusters_to_blocks(inode->i_sb, coff);
7081         } else {
7082                 /*
7083                  * Truncate completed, leave happily.
7084                  */
7085                 status = 0;
7086                 goto bail;
7087         }
7088
7089         phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7090
7091         status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7092                                           phys_cpos, trunc_len, flags, &dealloc,
7093                                           refcount_loc);
7094         if (status < 0) {
7095                 mlog_errno(status);
7096                 goto bail;
7097         }
7098
7099         ocfs2_reinit_path(path, 1);
7100
7101         /*
7102          * The check above will catch the case where we've truncated
7103          * away all allocation.
7104          */
7105         goto start;
7106
7107 bail:
7108
7109         ocfs2_schedule_truncate_log_flush(osb, 1);
7110
7111         ocfs2_run_deallocs(osb, &dealloc);
7112
7113         ocfs2_free_path(path);
7114
7115         mlog_exit(status);
7116         return status;
7117 }
7118
7119 /*
7120  * 'start' is inclusive, 'end' is not.
7121  */
7122 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7123                           unsigned int start, unsigned int end, int trunc)
7124 {
7125         int ret;
7126         unsigned int numbytes;
7127         handle_t *handle;
7128         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7129         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7130         struct ocfs2_inline_data *idata = &di->id2.i_data;
7131
7132         if (end > i_size_read(inode))
7133                 end = i_size_read(inode);
7134
7135         BUG_ON(start >= end);
7136
7137         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7138             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7139             !ocfs2_supports_inline_data(osb)) {
7140                 ocfs2_error(inode->i_sb,
7141                             "Inline data flags for inode %llu don't agree! "
7142                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7143                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7144                             le16_to_cpu(di->i_dyn_features),
7145                             OCFS2_I(inode)->ip_dyn_features,
7146                             osb->s_feature_incompat);
7147                 ret = -EROFS;
7148                 goto out;
7149         }
7150
7151         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7152         if (IS_ERR(handle)) {
7153                 ret = PTR_ERR(handle);
7154                 mlog_errno(ret);
7155                 goto out;
7156         }
7157
7158         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7159                                       OCFS2_JOURNAL_ACCESS_WRITE);
7160         if (ret) {
7161                 mlog_errno(ret);
7162                 goto out_commit;
7163         }
7164
7165         numbytes = end - start;
7166         memset(idata->id_data + start, 0, numbytes);
7167
7168         /*
7169          * No need to worry about the data page here - it's been
7170          * truncated already and inline data doesn't need it for
7171          * pushing zero's to disk, so we'll let readpage pick it up
7172          * later.
7173          */
7174         if (trunc) {
7175                 i_size_write(inode, start);
7176                 di->i_size = cpu_to_le64(start);
7177         }
7178
7179         inode->i_blocks = ocfs2_inode_sector_count(inode);
7180         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7181
7182         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7183         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7184
7185         ocfs2_journal_dirty(handle, di_bh);
7186
7187 out_commit:
7188         ocfs2_commit_trans(osb, handle);
7189
7190 out:
7191         return ret;
7192 }