/* drivers/md/persistent-data/dm-btree.c */
1 /*
2  * Copyright (C) 2011 Red Hat, Inc. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm-btree-internal.h"
8 #include "dm-space-map.h"
9 #include "dm-transaction-manager.h"
10
11 #include <linux/module.h>
12 #include <linux/device-mapper.h>
13
14 #define DM_MSG_PREFIX "btree"
15
16 /*----------------------------------------------------------------
17  * Array manipulation
18  *--------------------------------------------------------------*/
/*
 * Copy @len bytes from @src to @dest, then drop the sparse "blessed for
 * disk" annotation on @src (see __dm_bless_for_disk() and friends).
 */
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}
25
/*
 * Insert @elt into an array of @nr_elts elements, each @elt_size bytes,
 * at position @index, shifting later elements up by one slot.  The
 * caller must guarantee room for nr_elts + 1 elements.  @elt must be
 * blessed for disk; the final copy unblesses it.
 */
static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}
37
38 /*----------------------------------------------------------------*/
39
40 /* makes the assumption that no two keys are the same. */
41 static int bsearch(struct node *n, uint64_t key, int want_hi)
42 {
43         int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44
45         while (hi - lo > 1) {
46                 int mid = lo + ((hi - lo) / 2);
47                 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48
49                 if (mid_key == key)
50                         return mid;
51
52                 if (mid_key < key)
53                         lo = mid;
54                 else
55                         hi = mid;
56         }
57
58         return want_hi ? hi : lo;
59 }
60
/*
 * Index of the largest key <= @key in @n, or -1 if every key is greater.
 */
int lower_bound(struct node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}
65
66 void inc_children(struct dm_transaction_manager *tm, struct node *n,
67                   struct dm_btree_value_type *vt)
68 {
69         unsigned i;
70         uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
71
72         if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
73                 for (i = 0; i < nr_entries; i++)
74                         dm_tm_inc(tm, value64(n, i));
75         else if (vt->inc)
76                 for (i = 0; i < nr_entries; i++)
77                         vt->inc(vt->context,
78                                 value_ptr(n, i, vt->size));
79 }
80
/*
 * Insert the pair (@key, @value) at slot @index of @node.  @value must
 * be blessed for disk and is unblessed on every path (explicitly on
 * failure, via array_insert()/memcpy_disk() on success).  Returns
 * -ENOMEM when @index is out of range or the node is already full.
 */
static int insert_at(size_t value_size, struct node *node, unsigned index,
		      uint64_t key, void *value)
		      __dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}
103
104 /*----------------------------------------------------------------*/
105
106 /*
107  * We want 3n entries (for some n).  This works more nicely for repeated
108  * insert remove loops than (2n + 1).
109  */
110 static uint32_t calc_max_entries(size_t value_size, size_t block_size)
111 {
112         uint32_t total, n;
113         size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */
114
115         block_size -= sizeof(struct node_header);
116         total = block_size / elt_size;
117         n = total / 3;          /* rounds down */
118
119         return 3 * n;
120 }
121
/*
 * Create an empty btree: a single zeroed leaf node.  On success *root
 * holds the location of the new (now unlocked) root block.
 */
int dm_btree_create(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);

	return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_create);
149
150 /*----------------------------------------------------------------*/
151
/*
 * Deletion uses a recursive algorithm, since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;		/* read-locked block for this frame */
	struct node *n;			/* cached pointer to b's node data */
	unsigned level;			/* btree level (multi-level trees) */
	unsigned nr_children;		/* entry count when frame was pushed */
	unsigned current_child;		/* next child index to visit */
};

struct del_stack {
	struct dm_transaction_manager *tm;
	int top;			/* index of top frame; -1 when empty */
	struct frame spine[MAX_SPINE_DEPTH];
};
170
171 static int top_frame(struct del_stack *s, struct frame **f)
172 {
173         if (s->top < 0) {
174                 DMERR("btree deletion stack empty");
175                 return -EINVAL;
176         }
177
178         *f = s->spine + s->top;
179
180         return 0;
181 }
182
183 static int unprocessed_frames(struct del_stack *s)
184 {
185         return s->top >= 0;
186 }
187
/*
 * Push a frame for block @b at btree level @level.  A shared node
 * (reference count > 1) just has its count decremented and is NOT
 * pushed - its children are still reachable from elsewhere.  Returns
 * -ENOMEM when the spine is full.
 */
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;	/* undo the push on lock failure */
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;
	}

	return 0;
}
226
/*
 * Drop the top frame: decrement the block's reference count (it is no
 * longer reachable from the tree being deleted) and release the read
 * lock taken by push_frame().
 */
static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}
234
235 int dm_btree_destroy(struct dm_btree_info *info, dm_block_t root)
236 {
237         int r;
238         struct del_stack *s;
239
240         s = kmalloc(sizeof(*s), GFP_KERNEL);
241         if (!s)
242                 return -ENOMEM;
243
244         s->tm = info->tm;
245         s->top = -1;
246
247         r = push_frame(s, root, 1);
248         if (r)
249                 goto out;
250
251         while (unprocessed_frames(s)) {
252                 uint32_t flags;
253                 struct frame *f;
254                 dm_block_t b;
255
256                 r = top_frame(s, &f);
257                 if (r)
258                         goto out;
259
260                 if (f->current_child >= f->nr_children) {
261                         pop_frame(s);
262                         continue;
263                 }
264
265                 flags = le32_to_cpu(f->n->header.flags);
266                 if (flags & INTERNAL_NODE) {
267                         b = value64(f->n, f->current_child);
268                         f->current_child++;
269                         r = push_frame(s, b, f->level);
270                         if (r)
271                                 goto out;
272
273                 } else if (f->level != (info->levels - 1)) {
274                         b = value64(f->n, f->current_child);
275                         f->current_child++;
276                         r = push_frame(s, b, f->level + 1);
277                         if (r)
278                                 goto out;
279
280                 } else {
281                         if (info->value_type.dec) {
282                                 unsigned i;
283
284                                 for (i = 0; i < f->nr_children; i++)
285                                         info->value_type.dec(info->value_type.context,
286                                                              value_ptr(f->n, i, info->value_type.size));
287                         }
288                         f->current_child = f->nr_children;
289                 }
290         }
291
292 out:
293         kfree(s);
294         return r;
295 }
296 EXPORT_SYMBOL_GPL(dm_btree_destroy);
297
/*
 * FIXME: implement or remove this fn before final submission.
 *
 * NOTE(review): this stub deletes nothing yet always returns 0
 * (success) and never writes *new_root - callers must not rely on it.
 */
int dm_btree_delete_gt(struct dm_btree_info *info, dm_block_t root, uint64_t *key,
		    dm_block_t *new_root)
{
	/* FIXME: implement */
	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_delete_gt);
306
307 /*----------------------------------------------------------------*/
308
/*
 * Walk one btree level from @block down to a leaf, using @search_fn to
 * choose an entry at each node.  On success the found key is stored in
 * *result_key and @value_size bytes of its value are copied to @v.
 * Returns -ENODATA when @key falls outside a node's entries.
 */
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		/* internal nodes store the child block in the chosen slot */
		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);

	return 0;
}
338
339 int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
340                     uint64_t *keys, void *value_le)
341 {
342         unsigned level, last_level = info->levels - 1;
343         int r = -ENODATA;
344         uint64_t rkey;
345         __le64 internal_value_le;
346         struct ro_spine spine;
347
348         init_ro_spine(&spine, info);
349         for (level = 0; level < info->levels; level++) {
350                 size_t size;
351                 void *value_p;
352
353                 if (level == last_level) {
354                         value_p = value_le;
355                         size = info->value_type.size;
356
357                 } else {
358                         value_p = &internal_value_le;
359                         size = sizeof(uint64_t);
360                 }
361
362                 r = btree_lookup_raw(&spine, root, keys[level],
363                                      lower_bound, &rkey,
364                                      value_p, size);
365
366                 if (!r) {
367                         if (rkey != keys[level]) {
368                                 exit_ro_spine(&spine);
369                                 return -ENODATA;
370                         }
371                 } else {
372                         exit_ro_spine(&spine);
373                         return r;
374                 }
375
376                 root = le64_to_cpu(internal_value_le);
377         }
378         exit_ro_spine(&spine);
379
380         return r;
381 }
382 EXPORT_SYMBOL_GPL(dm_btree_lookup);
383
/*
 * Splits a node by creating a sibling node and shifting half the nodes
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *           |
 *           v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |     |
 *                v     +------+
 *          +---------+        |
 *          | A* +++  |        v
 *          +---------+   +-------+
 *                        | B +++ |
 *                        +-------+
 *
 * Where A* is a shadow of A.
 *
 * NOTE(review): @root is currently unused.  If insert_at() fails, both
 * children stay locked - presumably the spine teardown releases them;
 * verify before relying on that path.
 */
static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
			       unsigned parent_index, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	/* move the upper half of left's entries into the new sibling */
	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	/* internal nodes hold 64-bit child pointers, leaves hold values */
	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r)
		return r;

	/* keep whichever child covers @key locked as the spine's current node */
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}
478
479 /*
480  * Splits a node by creating two new children beneath the given node.
481  *
482  * Before:
483  *        +----------+
484  *        | A ++++++ |
485  *        +----------+
486  *
487  *
488  * After:
489  *      +------------+
490  *      | A (shadow) |
491  *      +------------+
492  *          |   |
493  *   +------+   +----+
494  *   |               |
495  *   v               v
496  * +-------+     +-------+
497  * | B +++ |     | C +++ |
498  * +-------+     +-------+
499  */
500 static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
501 {
502         int r;
503         size_t size;
504         unsigned nr_left, nr_right;
505         struct dm_block *left, *right, *new_parent;
506         struct node *pn, *ln, *rn;
507         __le64 val;
508
509         new_parent = shadow_current(s);
510
511         r = new_block(s->info, &left);
512         if (r < 0)
513                 return r;
514
515         r = new_block(s->info, &right);
516         if (r < 0) {
517                 /* FIXME: put left */
518                 return r;
519         }
520
521         pn = dm_block_data(new_parent);
522         ln = dm_block_data(left);
523         rn = dm_block_data(right);
524
525         nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
526         nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
527
528         ln->header.flags = pn->header.flags;
529         ln->header.nr_entries = cpu_to_le32(nr_left);
530         ln->header.max_entries = pn->header.max_entries;
531         ln->header.value_size = pn->header.value_size;
532
533         rn->header.flags = pn->header.flags;
534         rn->header.nr_entries = cpu_to_le32(nr_right);
535         rn->header.max_entries = pn->header.max_entries;
536         rn->header.value_size = pn->header.value_size;
537
538         memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
539         memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
540
541         size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
542                 sizeof(__le64) : s->info->value_type.size;
543         memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
544         memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
545                nr_right * size);
546
547         /* new_parent should just point to l and r now */
548         pn->header.flags = cpu_to_le32(INTERNAL_NODE);
549         pn->header.nr_entries = cpu_to_le32(2);
550         pn->header.max_entries = cpu_to_le32(
551                 calc_max_entries(sizeof(__le64),
552                                  dm_bm_block_size(
553                                          dm_tm_get_bm(s->info->tm))));
554         pn->header.value_size = cpu_to_le32(sizeof(__le64));
555
556         val = cpu_to_le64(dm_block_location(left));
557         __dm_bless_for_disk(&val);
558         pn->keys[0] = ln->keys[0];
559         memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
560
561         val = cpu_to_le64(dm_block_location(right));
562         __dm_bless_for_disk(&val);
563         pn->keys[1] = rn->keys[0];
564         memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
565
566         /*
567          * rejig the spine.  This is ugly, since it knows too
568          * much about the spine
569          */
570         if (s->nodes[0] != new_parent) {
571                 unlock_block(s->info, s->nodes[0]);
572                 s->nodes[0] = new_parent;
573         }
574         if (key < le64_to_cpu(rn->keys[0])) {
575                 unlock_block(s->info, right);
576                 s->nodes[1] = left;
577         } else {
578                 unlock_block(s->info, left);
579                 s->nodes[1] = right;
580         }
581         s->count = 2;
582
583         return 0;
584 }
585
/*
 * Shadow the path from @root towards @key, splitting any full node on
 * the way down.  On return the spine's current node is the shadowed
 * leaf and *index is the slot where @key lives or should be inserted.
 */
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, inc, top = 1;
	struct node *node;

	for (;;) {
		r = shadow_step(s, root, vt, &inc);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));
		if (inc)
			inc_children(s->info->tm, node, vt);

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		/* both fields are __le32, so equality holds without conversion */
		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, root, i, key);

			if (r < 0)
				return r;
		}

		/* a split may have replaced the current node; re-fetch */
		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	/* we're about to overwrite this value, so undo the increment for it */
	/* FIXME: shame that inc information is leaking outside the spine.
	 * Plus inc is just plain wrong in the event of a split */
	/*
	 * NOTE(review): after the i++ above, i can equal nr_entries, so
	 * keys[i] reads an unused slot here - confirm that stale data
	 * can never compare equal to @key.
	 */
	if (le64_to_cpu(node->keys[i]) == key && inc)
		if (vt->dec)
			vt->dec(vt->context, value_ptr(node, i, vt->size));

	*index = i;
	return 0;
}
657
658 static int insert(struct dm_btree_info *info, dm_block_t root,
659                   uint64_t *keys, void *value, dm_block_t *new_root,
660                   int *inserted)
661                   __dm_written_to_disk(value)
662 {
663         int r, need_insert;
664         unsigned level, index = -1, last_level = info->levels - 1;
665         dm_block_t block = root;
666         struct shadow_spine spine;
667         struct node *n;
668         struct dm_btree_value_type le64_type;
669
670         le64_type.context = NULL;
671         le64_type.size = sizeof(__le64);
672         le64_type.inc = NULL;
673         le64_type.dec = NULL;
674         le64_type.equal = NULL;
675
676         init_shadow_spine(&spine, info);
677
678         for (level = 0; level < (info->levels - 1); level++) {
679                 r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
680                 if (r < 0)
681                         goto bad;
682
683                 n = dm_block_data(shadow_current(&spine));
684                 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
685                                (le64_to_cpu(n->keys[index]) != keys[level]));
686
687                 if (need_insert) {
688                         dm_block_t new_tree;
689                         __le64 new_le;
690
691                         r = dm_btree_create(info, &new_tree);
692                         if (r < 0)
693                                 goto bad;
694
695                         new_le = cpu_to_le64(new_tree);
696                         __dm_bless_for_disk(&new_le);
697
698                         r = insert_at(sizeof(uint64_t), n, index,
699                                       keys[level], &new_le);
700                         if (r)
701                                 goto bad;
702                 }
703
704                 if (level < last_level)
705                         block = value64(n, index);
706         }
707
708         r = btree_insert_raw(&spine, block, &info->value_type,
709                              keys[level], &index);
710         if (r < 0)
711                 goto bad;
712
713         n = dm_block_data(shadow_current(&spine));
714         need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
715                        (le64_to_cpu(n->keys[index]) != keys[level]));
716
717         if (need_insert) {
718                 if (inserted)
719                         *inserted = 1;
720
721                 r = insert_at(info->value_type.size, n, index,
722                               keys[level], value);
723                 if (r)
724                         goto bad_unblessed;
725         } else {
726                 if (inserted)
727                         *inserted = 0;
728
729                 if (info->value_type.dec &&
730                     (!info->value_type.equal ||
731                      !info->value_type.equal(
732                              info->value_type.context,
733                              value_ptr(n, index, info->value_type.size),
734                              value))) {
735                         info->value_type.dec(info->value_type.context,
736                                              value_ptr(n, index, info->value_type.size));
737                 }
738                 memcpy_disk(value_ptr(n, index, info->value_type.size),
739                             value, info->value_type.size);
740         }
741
742         *new_root = shadow_root(&spine);
743         exit_shadow_spine(&spine);
744
745         return 0;
746
747 bad:
748         __dm_unbless_for_disk(value);
749 bad_unblessed:
750         exit_shadow_spine(&spine);
751         return r;
752 }
753
/*
 * Insert @value at @keys; see insert() above.  Callers that need to
 * know whether the key was new should use dm_btree_insert_notify().
 */
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
761
/*
 * As dm_btree_insert(), but also reports through *inserted whether the
 * key was newly created (1) or an existing value was overwritten (0).
 */
int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
770
771 /*----------------------------------------------------------------*/
772
773 int dm_btree_clone(struct dm_btree_info *info, dm_block_t root,
774                    dm_block_t *clone)
775 {
776         int r;
777         struct dm_block *b, *orig_b;
778         struct node *b_node, *orig_node;
779
780         /* Copy the root node */
781         r = new_block(info, &b);
782         if (r < 0)
783                 return r;
784
785         r = dm_tm_read_lock(info->tm, root, &btree_node_validator, &orig_b);
786         if (r < 0) {
787                 dm_block_t location = dm_block_location(b);
788
789                 unlock_block(info, b);
790                 dm_tm_dec(info->tm, location);
791         }
792
793         *clone = dm_block_location(b);
794         b_node = dm_block_data(b);
795         orig_node = dm_block_data(orig_b);
796
797         memcpy(b_node, orig_node,
798                dm_bm_block_size(dm_tm_get_bm(info->tm)));
799         dm_tm_unlock(info->tm, orig_b);
800         inc_children(info->tm, b_node, &info->value_type);
801         dm_tm_unlock(info->tm, b);
802
803         return 0;
804 }
805 EXPORT_SYMBOL_GPL(dm_btree_clone);
806
807 /*----------------------------------------------------------------*/
808
/*
 * Descend from @block always following the last (highest) entry.  The
 * highest key seen is stored in *result_key; when @next_block is
 * non-NULL the leaf's last value is returned through it as well (used
 * to chase into the next btree level).  Returns -ENODATA for an empty
 * node.
 */
static int find_highest_key(struct ro_spine *s, dm_block_t block,
			    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		/* also read the leaf's value when the caller wants it */
		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}
837
838 int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
839                               uint64_t *result_keys)
840 {
841         int r = 0, count = 0, level;
842         struct ro_spine spine;
843
844         init_ro_spine(&spine, info);
845         for (level = 0; level < info->levels; level++) {
846                 r = find_highest_key(&spine, root, result_keys + level,
847                                      level == info->levels - 1 ? NULL : &root);
848                 if (r == -ENODATA) {
849                         r = 0;
850                         break;
851
852                 } else if (r)
853                         break;
854
855                 count++;
856         }
857         exit_ro_spine(&spine);
858
859         return r ? r : count;
860 }
861 EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);