2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
15 * This work is based on the LPC-trie which is originally described in:
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
28 * Code from fib_hash has been reused which includes the following header:
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
35 * IPv4 FIB: lookup engine and maintenance routines.
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
45 * Substantial contributions to this work comes from:
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
53 #define VERSION "0.408"
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <linux/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/net_namespace.h>
78 #include <net/protocol.h>
79 #include <net/route.h>
82 #include <net/ip_fib.h>
83 #include "fib_lookup.h"
85 #undef CONFIG_IP_FIB_TRIE_STATS
86 #define MAX_STAT_DEPTH 32
88 #define KEYLENGTH (8*sizeof(t_key))
90 typedef unsigned int t_key;
94 #define NODE_TYPE_MASK 0x1UL
95 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
97 #define IS_TNODE(n) (!(n->parent & T_LEAF))
98 #define IS_LEAF(n) (n->parent & T_LEAF)
102 unsigned long parent;
107 unsigned long parent;
108 struct hlist_head list;
113 struct hlist_node hlist;
116 struct list_head falh;
121 unsigned long parent;
122 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
123 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
124 unsigned short full_children; /* KEYLENGTH bits needed */
125 unsigned short empty_children; /* KEYLENGTH bits needed */
127 struct node *child[0];
130 #ifdef CONFIG_IP_FIB_TRIE_STATS
131 struct trie_use_stats {
133 unsigned int backtrack;
134 unsigned int semantic_match_passed;
135 unsigned int semantic_match_miss;
136 unsigned int null_node_hit;
137 unsigned int resize_node_skipped;
142 unsigned int totdepth;
143 unsigned int maxdepth;
146 unsigned int nullpointers;
147 unsigned int nodesizes[MAX_STAT_DEPTH];
152 #ifdef CONFIG_IP_FIB_TRIE_STATS
153 struct trie_use_stats stats;
158 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
159 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
160 static struct node *resize(struct trie *t, struct tnode *tn);
161 static struct tnode *inflate(struct trie *t, struct tnode *tn);
162 static struct tnode *halve(struct trie *t, struct tnode *tn);
163 static void tnode_free(struct tnode *tn);
165 static struct kmem_cache *fn_alias_kmem __read_mostly;
/* Return the parent tnode of @node: the stored parent word with the
 * leaf/tnode type bit (NODE_TYPE_MASK) cleared, read under RCU.
 * NOTE(review): this listing is gapped — the opening brace and the
 * declaration of 'ret' are missing from the captured lines. */
167 static inline struct tnode *node_parent(struct node *node)
171 ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
172 return rcu_dereference(ret);
/* Publish @ptr as @node's parent while preserving the node's existing
 * type bit (leaf vs. tnode); rcu_assign_pointer orders the store for
 * concurrent RCU readers. */
175 static inline void node_set_parent(struct node *node, struct tnode *ptr)
177 rcu_assign_pointer(node->parent,
178 (unsigned long)ptr | NODE_TYPE(node));
181 /* rcu_read_lock needs to be held by the caller on the read side */
/* Fetch child @i of @tn under RCU; BUG if the index exceeds the
 * 2^bits-entry child array. */
183 static inline struct node *tnode_get_child(struct tnode *tn, int i)
185 BUG_ON(i >= 1 << tn->bits);
187 return rcu_dereference(tn->child[i]);
/* Number of child slots in @tn: always 2^bits. */
190 static inline int tnode_child_length(const struct tnode *tn)
192 return 1 << tn->bits;
/* Keep only the top @l bits of key @k, zeroing the rest; l == 0 is
 * special-cased because shifting by KEYLENGTH would be undefined. */
195 static inline t_key mask_pfx(t_key k, unsigned short l)
197 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
/* Extract @bits bits of @a starting at bit @offset (counting from the
 * MSB), returned right-aligned.
 * NOTE(review): the else branch (returning 0 when offset >= KEYLENGTH)
 * is missing from this gapped listing. */
200 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
202 if (offset < KEYLENGTH)
203 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
208 static inline int tkey_equals(t_key a, t_key b)
/* Compare the bit ranges [offset, offset+bits) of keys @a and @b;
 * returns nonzero when they match. An empty or out-of-range span
 * compares equal (the early-exit body for that branch is missing
 * from this gapped listing). */
213 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
215 if (bits == 0 || offset >= KEYLENGTH)
217 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
218 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
/* Find the position of the first differing bit between @a and @b at or
 * after @offset by left-shifting the XOR until the MSB is set.
 * NOTE(review): most of this function (the diff/i declarations, the
 * loop body, and the return) is missing from this gapped listing. */
221 static inline int tkey_mismatch(t_key a, int offset, t_key b)
228 while ((diff << i) >> (KEYLENGTH-1) == 0)
234 To understand this stuff, an understanding of keys and all their bits is
235 necessary. Every node in the trie has a key associated with it, but not
236 all of the bits in that key are significant.
238 Consider a node 'n' and its parent 'tp'.
240 If n is a leaf, every bit in its key is significant. Its presence is
241 necessitated by path compression, since during a tree traversal (when
242 searching for a leaf - unless we are doing an insertion) we will completely
243 ignore all skipped bits we encounter. Thus we need to verify, at the end of
244 a potentially successful search, that we have indeed been walking the
247 Note that we can never "miss" the correct key in the tree if present by
248 following the wrong path. Path compression ensures that segments of the key
249 that are the same for all keys with a given prefix are skipped, but the
250 skipped part *is* identical for each node in the subtrie below the skipped
251 bit! trie_insert() in this implementation takes care of that - note the
252 call to tkey_sub_equals() in trie_insert().
254 if n is an internal node - a 'tnode' here, the various parts of its key
255 have many different meanings.
258 _________________________________________________________________
259 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
260 -----------------------------------------------------------------
261 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
263 _________________________________________________________________
264 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
265 -----------------------------------------------------------------
266 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
273 First, let's just ignore the bits that come before the parent tp, that is
274 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
275 not use them for anything.
277 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
278 index into the parent's child array. That is, they will be used to find
279 'n' among tp's children.
281 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
284 All the bits we have seen so far are significant to the node n. The rest
285 of the bits are really not needed or indeed known in n->key.
287 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
288 n's child array, and will of course be different for each child.
291 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
/* Sanity check: a tnode's significant bits (pos + bits) must fit in a
 * 32-bit IPv4 key; warn (non-fatally) if the invariant is violated. */
296 static inline void check_tnode(const struct tnode *tn)
298 WARN_ON(tn && tn->pos+tn->bits > 32);
301 static const int halve_threshold = 25;
302 static const int inflate_threshold = 50;
303 static const int halve_threshold_root = 8;
304 static const int inflate_threshold_root = 15;
/* RCU callback: return a fib_alias to its slab cache after the grace
 * period has expired. */
307 static void __alias_free_mem(struct rcu_head *head)
309 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
310 kmem_cache_free(fn_alias_kmem, fa);
/* Defer freeing of @fa until after an RCU grace period, so concurrent
 * lockless readers holding a reference stay safe. */
313 static inline void alias_free_mem_rcu(struct fib_alias *fa)
315 call_rcu(&fa->rcu, __alias_free_mem);
/* RCU callback: kfree a leaf once no reader can still see it. */
318 static void __leaf_free_rcu(struct rcu_head *head)
320 kfree(container_of(head, struct leaf, rcu));
/* RCU callback: kfree a leaf_info once no reader can still see it. */
323 static void __leaf_info_free_rcu(struct rcu_head *head)
325 kfree(container_of(head, struct leaf_info, rcu));
/* Defer freeing of @leaf (a leaf_info) past the RCU grace period. */
328 static inline void free_leaf_info(struct leaf_info *leaf)
330 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
/* Allocate @size zeroed bytes for a tnode: small nodes come from the
 * slab (kcalloc), larger ones straight from the page allocator.
 * NOTE(review): the 'pages' declaration and the NULL check after
 * alloc_pages() are missing from this gapped listing. */
333 static struct tnode *tnode_alloc(unsigned int size)
337 if (size <= PAGE_SIZE)
338 return kcalloc(size, 1, GFP_KERNEL);
340 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
344 return page_address(pages);
/* RCU callback: free a tnode via the same path it was allocated from —
 * recompute its size (header + child array) to pick kfree vs.
 * free_pages. The kfree branch body for small sizes is missing from
 * this gapped listing. */
347 static void __tnode_free_rcu(struct rcu_head *head)
349 struct tnode *tn = container_of(head, struct tnode, rcu);
350 unsigned int size = sizeof(struct tnode) +
351 (1 << tn->bits) * sizeof(struct node *);
353 if (size <= PAGE_SIZE)
356 free_pages((unsigned long)tn, get_order(size));
/* RCU-free @tn. Leaves and tnodes share a parent word, so a leaf cast
 * to tnode is dispatched to the leaf callback; the IS_LEAF(tn) test
 * and the else keyword are missing from this gapped listing. */
359 static inline void tnode_free(struct tnode *tn)
362 struct leaf *l = (struct leaf *) tn;
363 call_rcu_bh(&l->rcu, __leaf_free_rcu);
365 call_rcu(&tn->rcu, __tnode_free_rcu);
/* Allocate and initialize a new leaf. NOTE(review): the NULL check on
 * the allocation, the parent/key setup, and the return are missing
 * from this gapped listing. */
368 static struct leaf *leaf_new(void)
370 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
373 INIT_HLIST_HEAD(&l->list);
/* Allocate a leaf_info for prefix length @plen and init its alias
 * list. NOTE(review): the NULL check, the plen assignment, and the
 * return are missing from this gapped listing. */
378 static struct leaf_info *leaf_info_new(int plen)
380 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
383 INIT_LIST_HEAD(&li->falh);
/* Allocate a tnode with 2^bits (initially all-empty) child slots and
 * tag it as an internal node via the T_TNODE parent bit.
 * NOTE(review): the NULL check and the pos/bits/key field assignments
 * are missing from this gapped listing. */
388 static struct tnode* tnode_new(t_key key, int pos, int bits)
390 int nchildren = 1<<bits;
391 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
392 struct tnode *tn = tnode_alloc(sz);
396 tn->parent = T_TNODE;
400 tn->full_children = 0;
401 tn->empty_children = 1<<bits;
404 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
405 (unsigned int) (sizeof(struct node) * 1<<bits));
410 * Check whether a tnode 'n' is "full", i.e. it is an internal node
411 * and no bits are skipped. See discussion in dyntree paper p. 6
/* Returns 0 for NULL/leaf children (the early-return body is missing
 * from this gapped listing); otherwise full means the child's pos
 * immediately follows the parent's index bits. */
414 static inline int tnode_full(const struct tnode *tn, const struct node *n)
416 if (n == NULL || IS_LEAF(n))
419 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
/* Install child @n at slot @i of @tn; wasfull == -1 tells the reorg
 * helper to recompute the old child's fullness itself. */
422 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
424 tnode_put_child_reorg(tn, i, n, -1);
428 * Add a child at position i overwriting the old value.
429 * Update the value of full_children and empty_children.
/* NOTE(review): this gapped listing is missing the guard that computes
 * wasfull when the caller passed -1, the full_children increment and
 * decrement statements, and the NULL check before node_set_parent(). */
432 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
434 struct node *chi = tn->child[i];
437 BUG_ON(i >= 1<<tn->bits);
/* empty_children tracks how many of the 2^bits slots hold NULL */
440 /* update emptyChildren */
441 if (n == NULL && chi != NULL)
442 tn->empty_children++;
443 else if (n != NULL && chi == NULL)
444 tn->empty_children--;
446 /* update fullChildren */
448 wasfull = tnode_full(tn, chi);
450 isfull = tnode_full(tn, n);
451 if (wasfull && !isfull)
453 else if (!wasfull && isfull)
/* link the new child back to its parent, then publish it to readers */
457 node_set_parent(n, tn);
459 rcu_assign_pointer(tn->child[i], n);
/* Rebalance one tnode: repeatedly inflate (double) while the fill
 * ratio stays above inflate_threshold, then repeatedly halve while the
 * ratio of non-empty children falls below halve_threshold, finally
 * collapsing a one-child node into that child. Root nodes use the
 * looser *_root thresholds to stay larger.
 * NOTE(review): this gapped listing is missing many statements —
 * the NULL-tn early return, the empty-node free path, the bodies of
 * the inflate()/halve() calls with their ERR_PTR handling, the
 * max_resize initializations, and the loop that finds the surviving
 * child 'n' in the one-child case. Do not treat it as complete. */
462 static struct node *resize(struct trie *t, struct tnode *tn)
466 struct tnode *old_tn;
467 int inflate_threshold_use;
468 int halve_threshold_use;
474 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
475 tn, inflate_threshold, halve_threshold);
/* No children at all: the node itself can be freed (free path missing
 * from this listing). */
478 if (tn->empty_children == tnode_child_length(tn)) {
/* Exactly one child: compress this level away by returning the child. */
483 if (tn->empty_children == tnode_child_length(tn) - 1)
484 for (i = 0; i < tnode_child_length(tn); i++) {
491 /* compress one level */
492 node_set_parent(n, NULL);
497 * Double as long as the resulting node has a number of
498 * nonempty nodes that are above the threshold.
502 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
503 * the Helsinki University of Technology and Matti Tikkanen of Nokia
504 * Telecommunications, page 6:
505 * "A node is doubled if the ratio of non-empty children to all
506 * children in the *doubled* node is at least 'high'."
508 * 'high' in this instance is the variable 'inflate_threshold'. It
509 * is expressed as a percentage, so we multiply it with
510 * tnode_child_length() and instead of multiplying by 2 (since the
511 * child array will be doubled by inflate()) and multiplying
512 * the left-hand side by 100 (to handle the percentage thing) we
513 * multiply the left-hand side by 50.
515 * The left-hand side may look a bit weird: tnode_child_length(tn)
516 * - tn->empty_children is of course the number of non-null children
517 * in the current node. tn->full_children is the number of "full"
518 * children, that is non-null tnodes with a skip value of 0.
519 * All of those will be doubled in the resulting inflated tnode, so
520 * we just count them one extra time here.
522 * A clearer way to write this would be:
524 * to_be_doubled = tn->full_children;
525 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
528 * new_child_length = tnode_child_length(tn) * 2;
530 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
532 * if (new_fill_factor >= inflate_threshold)
534 * ...and so on, tho it would mess up the while () loop.
537 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
541 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
542 * inflate_threshold * new_child_length
544 * expand not_to_be_doubled and to_be_doubled, and shorten:
545 * 100 * (tnode_child_length(tn) - tn->empty_children +
546 * tn->full_children) >= inflate_threshold * new_child_length
548 * expand new_child_length:
549 * 100 * (tnode_child_length(tn) - tn->empty_children +
550 * tn->full_children) >=
551 * inflate_threshold * tnode_child_length(tn) * 2
554 * 50 * (tn->full_children + tnode_child_length(tn) -
555 * tn->empty_children) >= inflate_threshold *
556 * tnode_child_length(tn)
562 /* Keep root node larger */
565 inflate_threshold_use = inflate_threshold_root;
567 inflate_threshold_use = inflate_threshold;
/* Inflate loop; max_resize bounds the number of doublings (its
 * initialization is missing from this listing). */
571 while ((tn->full_children > 0 && max_resize-- &&
572 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
573 inflate_threshold_use * tnode_child_length(tn))) {
579 #ifdef CONFIG_IP_FIB_TRIE_STATS
580 t->stats.resize_node_skipped++;
/* Ran out of resize budget: warn so thresholds can be tuned. */
586 if (max_resize < 0) {
588 printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
589 inflate_threshold_root, tn->bits);
591 printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
592 inflate_threshold, tn->bits);
598 * Halve as long as the number of empty children in this
599 * node is above threshold.
603 /* Keep root node larger */
606 halve_threshold_use = halve_threshold_root;
608 halve_threshold_use = halve_threshold;
612 while (tn->bits > 1 && max_resize-- &&
613 100 * (tnode_child_length(tn) - tn->empty_children) <
614 halve_threshold_use * tnode_child_length(tn)) {
620 #ifdef CONFIG_IP_FIB_TRIE_STATS
621 t->stats.resize_node_skipped++;
627 if (max_resize < 0) {
629 printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
630 halve_threshold_root, tn->bits);
632 printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
633 halve_threshold, tn->bits);
636 /* Only one child remains */
637 if (tn->empty_children == tnode_child_length(tn) - 1)
638 for (i = 0; i < tnode_child_length(tn); i++) {
645 /* compress one level */
647 node_set_parent(n, NULL);
652 return (struct node *) tn;
/* Double @tn's child array (bits+1). Each old child either moves to
 * one of its two new slots (leaf / skipping tnode, chosen by the next
 * key bit), has its two children promoted (bits==1 tnode), or is split
 * into freshly allocated 'left'/'right' halves. All new tnodes are
 * preallocated first so an allocation failure can unwind cleanly and
 * return ERR_PTR(-ENOMEM) while the old node stays intact.
 * NOTE(review): this gapped listing is missing the IS_ERR check after
 * tnode_new, several loop/branch braces, the 'continue' statements,
 * the mask computation in the split path, and the final 'return tn' /
 * nomem label lines. */
655 static struct tnode *inflate(struct trie *t, struct tnode *tn)
658 struct tnode *oldtnode = tn;
659 int olen = tnode_child_length(tn);
662 pr_debug("In inflate\n");
664 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
667 return ERR_PTR(-ENOMEM);
670 * Preallocate and store tnodes before the actual work so we
671 * don't get into an inconsistent state if memory allocation
672 * fails. In case of failure we return the oldnode and inflate
673 * of tnode is ignored.
676 for (i = 0; i < olen; i++) {
677 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
/* Only full tnodes with more than one index bit need a split pair. */
681 inode->pos == oldtnode->pos + oldtnode->bits &&
683 struct tnode *left, *right;
684 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
686 left = tnode_new(inode->key&(~m), inode->pos + 1,
691 right = tnode_new(inode->key|m, inode->pos + 1,
699 put_child(t, tn, 2*i, (struct node *) left);
700 put_child(t, tn, 2*i+1, (struct node *) right);
704 for (i = 0; i < olen; i++) {
705 struct node *node = tnode_get_child(oldtnode, i);
706 struct tnode *left, *right;
713 /* A leaf or an internal node with skipped bits */
715 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
716 tn->pos + tn->bits - 1) {
717 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
719 put_child(t, tn, 2*i, node);
721 put_child(t, tn, 2*i+1, node);
725 /* An internal node with two children */
726 inode = (struct tnode *) node;
728 if (inode->bits == 1) {
729 put_child(t, tn, 2*i, inode->child[0]);
730 put_child(t, tn, 2*i+1, inode->child[1]);
736 /* An internal node with more than two children */
738 /* We will replace this node 'inode' with two new
739 * ones, 'left' and 'right', each with half of the
740 * original children. The two new nodes will have
741 * a position one bit further down the key and this
742 * means that the "significant" part of their keys
743 * (see the discussion near the top of this file)
744 * will differ by one bit, which will be "0" in
745 * left's key and "1" in right's key. Since we are
746 * moving the key position by one step, the bit that
747 * we are moving away from - the bit at position
748 * (inode->pos) - is the one that will differ between
749 * left and right. So... we synthesize that bit in the
751 * The mask 'm' below will be a single "one" bit at
752 * the position (inode->pos)
755 /* Use the old key, but set the new significant
759 left = (struct tnode *) tnode_get_child(tn, 2*i);
760 put_child(t, tn, 2*i, NULL);
764 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
765 put_child(t, tn, 2*i+1, NULL);
/* Distribute inode's children: first half to left, second to right. */
769 size = tnode_child_length(left);
770 for (j = 0; j < size; j++) {
771 put_child(t, left, j, inode->child[j]);
772 put_child(t, right, j, inode->child[j + size]);
774 put_child(t, tn, 2*i, resize(t, left));
775 put_child(t, tn, 2*i+1, resize(t, right));
779 tnode_free(oldtnode);
/* Error unwind: free all preallocated children, then the new node. */
783 int size = tnode_child_length(tn);
786 for (j = 0; j < size; j++)
788 tnode_free((struct tnode *)tn->child[j]);
792 return ERR_PTR(-ENOMEM);
/* Shrink @tn's child array to half (bits-1). Each pair of old slots
 * collapses into one: an empty pair slot passes the surviving child
 * through, while two nonempty children get a preallocated 1-bit binary
 * tnode as their new common parent. Preallocation happens before any
 * mutation so ENOMEM can unwind without corrupting the trie.
 * NOTE(review): this gapped listing is missing the IS_ERR check, the
 * left/right NULL tests in both loops, 'continue' statements, brace
 * lines, and the final 'return tn' / nomem label. */
796 static struct tnode *halve(struct trie *t, struct tnode *tn)
798 struct tnode *oldtnode = tn;
799 struct node *left, *right;
801 int olen = tnode_child_length(tn);
803 pr_debug("In halve\n");
805 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
808 return ERR_PTR(-ENOMEM);
811 * Preallocate and store tnodes before the actual work so we
812 * don't get into an inconsistent state if memory allocation
813 * fails. In case of failure we return the oldnode and halve
814 * of tnode is ignored.
817 for (i = 0; i < olen; i += 2) {
818 left = tnode_get_child(oldtnode, i);
819 right = tnode_get_child(oldtnode, i+1);
821 /* Two nonempty children */
825 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
830 put_child(t, tn, i/2, (struct node *)newn);
835 for (i = 0; i < olen; i += 2) {
836 struct tnode *newBinNode;
838 left = tnode_get_child(oldtnode, i);
839 right = tnode_get_child(oldtnode, i+1);
841 /* At least one of the children is empty */
843 if (right == NULL) /* Both are empty */
845 put_child(t, tn, i/2, right);
850 put_child(t, tn, i/2, left);
854 /* Two nonempty children */
855 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
856 put_child(t, tn, i/2, NULL);
857 put_child(t, newBinNode, 0, left);
858 put_child(t, newBinNode, 1, right);
859 put_child(t, tn, i/2, resize(t, newBinNode));
861 tnode_free(oldtnode);
/* Error unwind: free preallocated binary nodes, then the new node. */
865 int size = tnode_child_length(tn);
868 for (j = 0; j < size; j++)
870 tnode_free((struct tnode *)tn->child[j]);
874 return ERR_PTR(-ENOMEM);
878 /* The read side must use rcu_read_lock; currently the dump routines
879 do so via get_fa_head and dump */
/* Walk @l's hlist of leaf_info entries and return the one whose prefix
 * length equals @plen. NOTE(review): the 'return li' / 'return NULL'
 * lines are missing from this gapped listing. */
881 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
883 struct hlist_head *head = &l->list;
884 struct hlist_node *node;
885 struct leaf_info *li;
887 hlist_for_each_entry_rcu(li, node, head, hlist)
888 if (li->plen == plen)
/* Return the fib_alias list head for prefix length @plen on leaf @l.
 * NOTE(review): the NULL check on 'li' and the '&li->falh' return are
 * missing from this gapped listing. */
894 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
896 struct leaf_info *li = find_leaf_info(l, plen);
/* Insert @new into @head keeping leaf_info entries sorted by
 * ascending plen: walk until an entry with larger plen is found, then
 * link before it (or after the last smaller one / at the head of an
 * empty list). RCU-safe insertion primitives are used throughout.
 * NOTE(review): the else keyword, the break, and the 'last = li'
 * bookkeeping are missing from this gapped listing. */
904 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
906 struct leaf_info *li = NULL, *last = NULL;
907 struct hlist_node *node;
909 if (hlist_empty(head)) {
910 hlist_add_head_rcu(&new->hlist, head);
912 hlist_for_each_entry(li, node, head, hlist) {
913 if (new->plen > li->plen)
919 hlist_add_after_rcu(&last->hlist, &new->hlist);
921 hlist_add_before_rcu(&new->hlist, &li->hlist);
925 /* rcu_read_lock needs to be held by the caller on the read side */
/* Exact-match lookup: descend from the root, verifying at each tnode
 * that the (possibly skipped) bits of its key match ours, indexing by
 * the tn->bits bits at tn->pos. Returns the leaf on an exact key
 * match, else NULL. NOTE(review): this gapped listing is missing the
 * return type line, local declarations (pos/n/tn), the check_tnode
 * call, the non-matching 'break', and the final 'return NULL'. */
928 fib_find_node(struct trie *t, u32 key)
935 n = rcu_dereference(t->trie);
937 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
938 tn = (struct tnode *) n;
942 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
943 pos = tn->pos + tn->bits;
944 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
948 /* Case we have found a leaf. Compare prefixes */
950 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
951 return (struct leaf *)n;
/* After an insert/delete, walk from @tn up to the root, resizing each
 * node and re-linking it into its parent with the recorded wasfull
 * state, then resize the root itself and return the new root pointer.
 * NOTE(review): local declarations (tp, wasfull) and several brace
 * lines are missing from this gapped listing. */
956 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
959 t_key cindex, key = tn->key;
962 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
963 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
964 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
965 tn = (struct tnode *) resize (t, (struct tnode *)tn);
966 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
968 tp = node_parent((struct node *) tn);
974 /* Handle last (top) tnode */
976 tn = (struct tnode*) resize(t, (struct tnode *)tn);
978 return (struct node*) tn;
981 /* only used from updater-side */
/* Insert key @key with prefix length @plen, creating leaf / leaf_info
 * / intermediate tnode structures as needed, then rebalance the trie.
 * Returns the fib_alias list head for the (key, plen) pair, or NULL
 * on allocation failure.
 * NOTE(review): this gapped listing is missing local declarations
 * (n, l, pos, newpos, cindex, missbit), the initial n = t->trie read,
 * the leaf_new() call and its NULL checks, the 'return fa_head' at
 * Case 1, and several brace/else lines. Do not treat it as complete. */
983 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
986 struct tnode *tp = NULL, *tn = NULL;
990 struct list_head *fa_head = NULL;
991 struct leaf_info *li;
997 /* If we point to NULL, stop. Either the tree is empty and we should
998 * just put a new leaf in it, or we have reached an empty child slot,
999 * and we should just put our new leaf in that.
1000 * If we point to a T_TNODE, check if it matches our key. Note that
1001 * a T_TNODE might be skipping any number of bits - its 'pos' need
1002 * not be the parent's 'pos'+'bits'!
1004 * If it does match the current key, get pos/bits from it, extract
1005 * the index from our key, push the T_TNODE and walk the tree.
1007 * If it doesn't, we have to replace it with a new T_TNODE.
1009 * If we point to a T_LEAF, it might or might not have the same key
1010 * as we do. If it does, just change the value, update the T_LEAF's
1011 * value, and return it.
1012 * If it doesn't, we need to replace it with a T_TNODE.
1015 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1016 tn = (struct tnode *) n;
1020 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1022 pos = tn->pos + tn->bits;
1023 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1025 BUG_ON(n && node_parent(n) != tn);
1031 * n ----> NULL, LEAF or TNODE
1033 * tp is n's (parent) ----> NULL or TNODE
1036 BUG_ON(tp && IS_LEAF(tp));
1038 /* Case 1: n is a leaf. Compare prefixes */
1040 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1041 struct leaf *l = (struct leaf *) n;
1043 li = leaf_info_new(plen);
1048 fa_head = &li->falh;
1049 insert_leaf_info(&l->list, li);
/* New leaf path: allocate a leaf_info; on failure the new leaf is
 * freed and NULL returned. */
1059 li = leaf_info_new(plen);
1062 tnode_free((struct tnode *) l);
1066 fa_head = &li->falh;
1067 insert_leaf_info(&l->list, li);
1069 if (t->trie && n == NULL) {
1070 /* Case 2: n is NULL, and will just insert a new leaf */
1072 node_set_parent((struct node *)l, tp);
1074 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1075 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1077 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1079 * Add a new tnode here
1080 * first tnode need some special handling
1084 pos = tp->pos+tp->bits;
1089 newpos = tkey_mismatch(key, pos, n->key);
1090 tn = tnode_new(n->key, newpos, 1);
1093 tn = tnode_new(key, newpos, 1); /* First tnode */
1098 tnode_free((struct tnode *) l);
1102 node_set_parent((struct node *)tn, tp);
1104 missbit = tkey_extract_bits(key, newpos, 1);
1105 put_child(t, tn, missbit, (struct node *)l);
1106 put_child(t, tn, 1-missbit, n);
1109 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1110 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1112 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
/* Invariant check: significant bits must fit a 32-bit IPv4 key. */
1117 if (tp && tp->pos + tp->bits > 32)
1118 printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1119 tp, tp->pos, tp->bits, key, plen);
1121 /* Rebalance the trie */
1123 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1129 * Caller must hold RTNL.
/* Table-level insert entry point: validate the request, create/look up
 * the fib_info, find any existing alias with the same
 * (prefix, tos, priority), then either replace it (NLM_F_REPLACE),
 * reject a duplicate (NLM_F_EXCL / perfect match), or allocate a new
 * fib_alias and link it in, finally sending an RTM_NEWROUTE
 * notification. Error paths release the fib_info / free the alias.
 * NOTE(review): this gapped listing is missing local declarations
 * (key, mask, l, state, err, fa_orig), the plen/mask validation and
 * error gotos, the IS_ERR(fi) check, NULL checks after the slab
 * allocations, the fa_orig assignment, rt_cache_flush calls, and the
 * 'succeeded:'/'out:'/'err:' labels. Do not treat it as complete. */
1131 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1133 struct trie *t = (struct trie *) tb->tb_data;
1134 struct fib_alias *fa, *new_fa;
1135 struct list_head *fa_head = NULL;
1136 struct fib_info *fi;
1137 int plen = cfg->fc_dst_len;
1138 u8 tos = cfg->fc_tos;
1146 key = ntohl(cfg->fc_dst);
1148 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1150 mask = ntohl(inet_make_mask(plen));
1157 fi = fib_create_info(cfg);
1163 l = fib_find_node(t, key);
1167 fa_head = get_fa_head(l, plen);
1168 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1171 /* Now fa, if non-NULL, points to the first fib alias
1172 * with the same keys [prefix,tos,priority], if such key already
1173 * exists or to the node before which we will insert new one.
1175 * If fa is NULL, we will need to allocate a new one and
1176 * insert to the head of f.
1178 * If f is NULL, no fib node matched the destination key
1179 * and we need to allocate a new one of those as well.
1182 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1183 struct fib_alias *fa_orig;
1186 if (cfg->fc_nlflags & NLM_F_EXCL)
1189 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1190 struct fib_info *fi_drop;
1193 if (fi->fib_treeref > 1)
1197 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
/* Swap the alias in place under RCU, then drop the old fib_info. */
1201 fi_drop = fa->fa_info;
1202 new_fa->fa_tos = fa->fa_tos;
1203 new_fa->fa_info = fi;
1204 new_fa->fa_type = cfg->fc_type;
1205 new_fa->fa_scope = cfg->fc_scope;
1206 state = fa->fa_state;
1207 new_fa->fa_state &= ~FA_S_ACCESSED;
1209 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1210 alias_free_mem_rcu(fa);
1212 fib_release_info(fi_drop);
1213 if (state & FA_S_ACCESSED)
1215 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1216 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1220 /* Error if we find a perfect match which
1221 * uses the same scope, type, and nexthop
1225 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1226 if (fa->fa_tos != tos)
1228 if (fa->fa_info->fib_priority != fi->fib_priority)
1230 if (fa->fa_type == cfg->fc_type &&
1231 fa->fa_scope == cfg->fc_scope &&
1232 fa->fa_info == fi) {
1236 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1240 if (!(cfg->fc_nlflags & NLM_F_CREATE))
1244 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL)
1248 new_fa->fa_info = fi;
1249 new_fa->fa_tos = tos;
1250 new_fa->fa_type = cfg->fc_type;
1251 new_fa->fa_scope = cfg->fc_scope;
1252 new_fa->fa_state = 0;
1254 * Insert new entry to the list.
1258 fa_head = fib_insert_node(t, key, plen);
1259 if (unlikely(!fa_head)) {
1261 goto out_free_new_fa;
1265 list_add_tail_rcu(&new_fa->fa_list,
1266 (fa ? &fa->fa_list : fa_head));
1269 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1270 &cfg->fc_nlinfo, 0);
1275 kmem_cache_free(fn_alias_kmem, new_fa);
1277 fib_release_info(fi);
1283 /* should be called with rcu_read_lock */
/* For each leaf_info on @l, mask the search key to that prefix length
 * and, if the leaf key matches, run fib_semantic_match(); a result
 * <= 0 means "answered" (matched or hard error) and is returned.
 * NOTE(review): this gapped listing is missing local declarations
 * (i, err, mask), the i = li->plen assignment, 'continue', and the
 * trailing 'return err' / 'return 1' lines. */
1284 static inline int check_leaf(struct trie *t, struct leaf *l,
1285 t_key key, int *plen, const struct flowi *flp,
1286 struct fib_result *res)
1290 struct leaf_info *li;
1291 struct hlist_head *hhead = &l->list;
1292 struct hlist_node *node;
1294 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1296 mask = inet_make_mask(i);
1297 if (l->key != (key & ntohl(mask)))
1300 if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1302 #ifdef CONFIG_IP_FIB_TRIE_STATS
1303 t->stats.semantic_match_passed++;
1307 #ifdef CONFIG_IP_FIB_TRIE_STATS
1308 t->stats.semantic_match_miss++;
1315 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1317 struct trie *t = (struct trie *) tb->tb_data;
1322 t_key key = ntohl(flp->fl4_dst);
1325 int current_prefix_length = KEYLENGTH;
1327 t_key node_prefix, key_prefix, pref_mismatch;
1332 n = rcu_dereference(t->trie);
1336 #ifdef CONFIG_IP_FIB_TRIE_STATS
1342 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1346 pn = (struct tnode *) n;
1354 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1357 n = tnode_get_child(pn, cindex);
1360 #ifdef CONFIG_IP_FIB_TRIE_STATS
1361 t->stats.null_node_hit++;
1367 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1375 cn = (struct tnode *)n;
1378 * It's a tnode, and we can do some extra checks here if we
1379 * like, to avoid descending into a dead-end branch.
1380 * This tnode is in the parent's child array at index
1381 * key[p_pos..p_pos+p_bits] but potentially with some bits
1382 * chopped off, so in reality the index may be just a
1383 * subprefix, padded with zero at the end.
1384 * We can also take a look at any skipped bits in this
1385 * tnode - everything up to p_pos is supposed to be ok,
1386 * and the non-chopped bits of the index (se previous
1387 * paragraph) are also guaranteed ok, but the rest is
1388 * considered unknown.
1390 * The skipped bits are key[pos+bits..cn->pos].
1393 /* If current_prefix_length < pos+bits, we are already doing
1394 * actual prefix matching, which means everything from
1395 * pos+(bits-chopped_off) onward must be zero along some
1396 * branch of this subtree - otherwise there is *no* valid
1397 * prefix present. Here we can only check the skipped
1398 * bits. Remember, since we have already indexed into the
1399 * parent's child array, we know that the bits we chopped of
1403 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1405 if (current_prefix_length < pos+bits) {
1406 if (tkey_extract_bits(cn->key, current_prefix_length,
1407 cn->pos - current_prefix_length) != 0 ||
1413 * If chopped_off=0, the index is fully validated and we
1414 * only need to look at the skipped bits for this, the new,
1415 * tnode. What we actually want to do is to find out if
1416 * these skipped bits match our key perfectly, or if we will
1417 * have to count on finding a matching prefix further down,
1418 * because if we do, we would like to have some way of
1419 * verifying the existence of such a prefix at this point.
1422 /* The only thing we can do at this point is to verify that
1423 * any such matching prefix can indeed be a prefix to our
1424 * key, and if the bits in the node we are inspecting that
1425 * do not match our key are not ZERO, this cannot be true.
1426 * Thus, find out where there is a mismatch (before cn->pos)
1427 * and verify that all the mismatching bits are zero in the
1431 /* Note: We aren't very concerned about the piece of the key
1432 * that precede pn->pos+pn->bits, since these have already been
1433 * checked. The bits after cn->pos aren't checked since these are
1434 * by definition "unknown" at this point. Thus, what we want to
1435 * see is if we are about to enter the "prefix matching" state,
1436 * and in that case verify that the skipped bits that will prevail
1437 * throughout this subtree are zero, as they have to be if we are
1438 * to find a matching prefix.
1441 node_prefix = mask_pfx(cn->key, cn->pos);
1442 key_prefix = mask_pfx(key, cn->pos);
1443 pref_mismatch = key_prefix^node_prefix;
1446 /* In short: If skipped bits in this node do not match the search
1447 * key, enter the "prefix matching" state directly.
1449 if (pref_mismatch) {
1450 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1452 pref_mismatch = pref_mismatch <<1;
1454 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1456 if (key_prefix != 0)
1459 if (current_prefix_length >= cn->pos)
1460 current_prefix_length = mp;
1463 pn = (struct tnode *)n; /* Descend */
1470 /* As zero don't change the child key (cindex) */
1471 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1474 /* Decrease current_... with bits chopped off */
1475 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1476 current_prefix_length = pn->pos + pn->bits - chopped_off;
1479 * Either we do the actual chop off, or, if we have
1480 * chopped off all bits in this tnode, walk up to our parent.
1483 if (chopped_off <= pn->bits) {
1484 cindex &= ~(1 << (chopped_off-1));
1486 struct tnode *parent = node_parent((struct node *) pn);
1490 /* Get Child's index */
1491 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1495 #ifdef CONFIG_IP_FIB_TRIE_STATS
1496 t->stats.backtrack++;
/* trie_leaf_remove() - find the leaf for @key and unlink it.
 *
 * Walks tnodes top-down (skipped bits are NOT re-checked, see the note
 * below), so the resulting leaf must be compared against @key before it
 * is trusted.  On a match the leaf is freed, removed from its parent's
 * child array and the trie is rebalanced from that parent; if the leaf
 * was the root, t->trie is cleared instead.
 *
 * NOTE(review): this numbered listing is non-contiguous — braces, some
 * locals (l, cindex) and the return path are elided between the visible
 * lines.
 */
1508 /* only called from updater side */
1509 static int trie_leaf_remove(struct trie *t, t_key key)
1512 struct tnode *tp = NULL;
1513 struct node *n = t->trie;
1516 pr_debug("entering trie_leaf_remove(%p)\n", n);
1518 /* Note that in the case skipped bits, those bits are *not* checked!
1519 * When we finish this, we will have NULL or a T_LEAF, and the
1520 * T_LEAF may or may not match our key.
1523 while (n != NULL && IS_TNODE(n)) {
1524 struct tnode *tn = (struct tnode *) n;
1526 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1528 BUG_ON(n && node_parent(n) != tn);
1530 l = (struct leaf *) n;
1532 if (!n || !tkey_equals(l->key, key))
1537  * Remove the leaf and rebalance the tree
1542 tp = node_parent(n);
1543 tnode_free((struct tnode *) n);
1546 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1547 put_child(t, (struct tnode *)tp, cindex, NULL);
1548 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1550 rcu_assign_pointer(t->trie, NULL);
/* fn_trie_delete() - delete one route alias described by @cfg.
 *
 * Finds the leaf for cfg->fc_dst/fc_dst_len, then scans its alias list
 * for an entry matching tos, type, scope, protocol and nexthop
 * (fib_nh_match).  On a match it notifies via rtmsg_fib(), unlinks the
 * alias (RCU), drops the leaf_info when its list empties, and removes
 * the leaf entirely once no leaf_info remains.
 *
 * NOTE(review): listing elides lines (locals, braces, rt_cache flush on
 * FA_S_ACCESSED path is only partially visible).
 */
1556  * Caller must hold RTNL.
1558 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1560 struct trie *t = (struct trie *) tb->tb_data;
1562 int plen = cfg->fc_dst_len;
1563 u8 tos = cfg->fc_tos;
1564 struct fib_alias *fa, *fa_to_delete;
1565 struct list_head *fa_head;
1567 struct leaf_info *li;
1572 key = ntohl(cfg->fc_dst);
1573 mask = ntohl(inet_make_mask(plen));
1579 l = fib_find_node(t, key);
1584 fa_head = get_fa_head(l, plen);
1585 fa = fib_find_alias(fa_head, tos, 0);
1590 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1592 fa_to_delete = NULL;
1593 fa_head = fa->fa_list.prev;
1595 list_for_each_entry(fa, fa_head, fa_list) {
1596 struct fib_info *fi = fa->fa_info;
1598 if (fa->fa_tos != tos)
1601 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1602 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1603 fa->fa_scope == cfg->fc_scope) &&
1604 (!cfg->fc_protocol ||
1605 fi->fib_protocol == cfg->fc_protocol) &&
1606 fib_nh_match(cfg, fi) == 0) {
1616 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1617 &cfg->fc_nlinfo, 0);
1619 l = fib_find_node(t, key);
1620 li = find_leaf_info(l, plen);
1622 list_del_rcu(&fa->fa_list);
1624 if (list_empty(fa_head)) {
1625 hlist_del_rcu(&li->hlist);
1629 if (hlist_empty(&l->list))
1630 trie_leaf_remove(t, key);
1632 if (fa->fa_state & FA_S_ACCESSED)
1635 fib_release_info(fa->fa_info);
1636 alias_free_mem_rcu(fa);
/* trie_flush_list() - drop every alias on @head whose fib_info is
 * marked RTNH_F_DEAD, releasing the fib_info and freeing the alias
 * via RCU.  Presumably returns the number of entries removed — the
 * counter and return statement are elided in this listing.
 */
1640 static int trie_flush_list(struct trie *t, struct list_head *head)
1642 struct fib_alias *fa, *fa_node;
1645 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1646 struct fib_info *fi = fa->fa_info;
1648 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1649 list_del_rcu(&fa->fa_list);
1650 fib_release_info(fa->fa_info);
1651 alias_free_mem_rcu(fa);
/* trie_flush_leaf() - flush dead aliases from every leaf_info hanging
 * off leaf @l, unlinking (RCU) any leaf_info whose alias list becomes
 * empty.  Accumulates trie_flush_list() counts into 'found' (its
 * declaration and the return are elided in this listing).
 */
1658 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1661 struct hlist_head *lih = &l->list;
1662 struct hlist_node *node, *tmp;
1663 struct leaf_info *li = NULL;
1665 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1666 found += trie_flush_list(t, &li->falh);
1668 if (list_empty(&li->falh)) {
1669 hlist_del_rcu(&li->hlist);
/* nextleaf() - depth-first successor among leaves.
 *
 * Given @thisleaf (or NULL to start from the root), returns the next
 * leaf of trie @t: find this node's slot in its parent, scan the
 * remaining siblings left-to-right, descend through tnodes to a leaf,
 * and pop back to the parent when a tnode's children are exhausted.
 * Returns NULL once the root has been fully walked.
 */
1676 /* rcu_read_lock needs to be held by the caller (read side) */
1678 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1680 struct node *c = (struct node *) thisleaf;
1683 struct node *trie = rcu_dereference(t->trie);
1689 if (IS_LEAF(trie)) /* trie w. just a leaf */
1690 return (struct leaf *) trie;
1692 p = (struct tnode*) trie; /* Start */
1699 /* Find the next child of the parent */
1701 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1705 last = 1 << p->bits;
1706 for (idx = pos; idx < last ; idx++) {
1707 c = rcu_dereference(p->child[idx]);
1712 /* Descend if tnode */
1713 while (IS_TNODE(c)) {
1714 p = (struct tnode *) c;
1717 /* Rightmost non-NULL branch */
1718 if (p && IS_TNODE(p))
1719 while (!(c = rcu_dereference(p->child[idx]))
1720 && idx < (1<<p->bits)) idx++;
1722 /* Done with this tnode? */
1723 if (idx >= (1 << p->bits) || !c)
1726 return (struct leaf *) c;
1729 /* No more children go up one step */
1730 c = (struct node *) p;
1733 return NULL; /* Ready. Root of trie */
/* fn_trie_flush() - flush dead routes from the whole table.
 *
 * Iterates every leaf with nextleaf(), flushing each; leaves whose
 * leaf_info list becomes empty are removed one step behind the
 * iterator (via 'll') so the walk itself is never invalidated, with a
 * final pass after the loop for the last leaf.
 */
1737  * Caller must hold RTNL.
1739 static int fn_trie_flush(struct fib_table *tb)
1741 struct trie *t = (struct trie *) tb->tb_data;
1742 struct leaf *ll = NULL, *l = NULL;
1745 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1746 found += trie_flush_leaf(t, l);
1748 if (ll && hlist_empty(&ll->list))
1749 trie_leaf_remove(t, ll->key);
1753 if (ll && hlist_empty(&ll->list))
1754 trie_leaf_remove(t, ll->key);
1756 pr_debug("trie_flush found=%d\n", found);
/* fn_trie_select_default() - choose a default route.
 *
 * Scans the aliases of the key-0 (default) leaf for RTN_UNICAST
 * entries in the requested scope, stopping once priority exceeds the
 * current result's; uses fib_detect_death() to skip dead gateways and
 * caches the selected order in tb->tb_default (-1 resets the cache).
 *
 * NOTE(review): several lines (order bookkeeping, early returns) are
 * elided in this listing.
 */
1761 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1763 struct trie *t = (struct trie *) tb->tb_data;
1764 int order, last_idx;
1765 struct fib_info *fi = NULL;
1766 struct fib_info *last_resort;
1767 struct fib_alias *fa = NULL;
1768 struct list_head *fa_head;
1777 l = fib_find_node(t, 0);
1781 fa_head = get_fa_head(l, 0);
1785 if (list_empty(fa_head))
1788 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1789 struct fib_info *next_fi = fa->fa_info;
1791 if (fa->fa_scope != res->scope ||
1792 fa->fa_type != RTN_UNICAST)
1795 if (next_fi->fib_priority > res->fi->fib_priority)
1797 if (!next_fi->fib_nh[0].nh_gw ||
1798 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1800 fa->fa_state |= FA_S_ACCESSED;
1803 if (next_fi != res->fi)
1805 } else if (!fib_detect_death(fi, order, &last_resort,
1806 &last_idx, tb->tb_default)) {
1807 fib_result_assign(res, fi);
1808 tb->tb_default = order;
1814 if (order <= 0 || fi == NULL) {
1815 tb->tb_default = -1;
1819 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1821 fib_result_assign(res, fi);
1822 tb->tb_default = order;
1826 fib_result_assign(res, last_resort);
1827 tb->tb_default = last_idx;
/* fn_trie_dump_fa() - emit one netlink record per alias on @fah via
 * fib_dump_info(), resuming from cb->args state; stops and returns a
 * negative value when the skb fills up (continuation bookkeeping is
 * elided in this listing).
 */
1832 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1833 struct sk_buff *skb, struct netlink_callback *cb)
1836 struct fib_alias *fa;
1838 __be32 xkey = htonl(key);
1843 /* rcu_read_lock is held by caller */
1845 list_for_each_entry_rcu(fa, fah, fa_list) {
1850 BUG_ON(!fa->fa_info);
1852 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1861 fa->fa_info, 0) < 0) {
/* fn_trie_dump_plen() - dump every leaf carrying a prefix of length
 * @plen; clears the deeper continuation state (cb->args[4..]) when
 * advancing to a new leaf and aborts when fn_trie_dump_fa() signals a
 * full skb.
 */
1871 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1872 struct netlink_callback *cb)
1875 struct list_head *fa_head;
1876 struct leaf *l = NULL;
1880 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1884 memset(&cb->args[4], 0,
1885 sizeof(cb->args) - 4*sizeof(cb->args[0]));
1887 fa_head = get_fa_head(l, plen);
1892 if (list_empty(fa_head))
1895 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
/* fn_trie_dump() - netlink dump entry point: walk prefix lengths from
 * 32 down to 0 (m = 0..32 maps to plen 32-m), clearing the per-length
 * continuation state (cb->args[3..]) between lengths.
 */
1904 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1907 struct trie *t = (struct trie *) tb->tb_data;
1912 for (m = 0; m <= 32; m++) {
1916 memset(&cb->args[3], 0,
1917 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1919 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
/* fib_hash_init() - allocate and initialise a trie-backed fib_table.
 *
 * Keeps the fib_hash name so generic FIB code needs no change (see the
 * comment below).  Creates the fib_alias slab cache on first use,
 * allocates table + embedded trie in one kmalloc, wires up the
 * fn_trie_* method pointers and zeroes the trie.  Prints the version
 * banner once, when the local table is created.
 */
1932 /* Fix more generic FIB names for init later */
1934 struct fib_table *fib_hash_init(u32 id)
1936 struct fib_table *tb;
1939 if (fn_alias_kmem == NULL)
1940 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1941 sizeof(struct fib_alias),
1942 0, SLAB_HWCACHE_ALIGN,
1945 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1951 tb->tb_default = -1;
1952 tb->tb_lookup = fn_trie_lookup;
1953 tb->tb_insert = fn_trie_insert;
1954 tb->tb_delete = fn_trie_delete;
1955 tb->tb_flush = fn_trie_flush;
1956 tb->tb_select_default = fn_trie_select_default;
1957 tb->tb_dump = fn_trie_dump;
1959 t = (struct trie *) tb->tb_data;
1960 memset(t, 0, sizeof(*t));
1962 if (id == RT_TABLE_LOCAL)
1963 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1968 #ifdef CONFIG_PROC_FS
1969 /* Depth first Trie walk iterator */
/* Holds per-open state for the /proc seq_file walkers: the netns
 * private data, both tries of this namespace, and the current tnode.
 * (The index/depth fields used by the walker are elided in this
 * listing.)
 */
1970 struct fib_trie_iter {
1971 struct seq_net_private p;
1972 struct trie *trie_local, *trie_main;
1973 struct tnode *tnode;
/* fib_trie_get_next() - advance @iter one node in depth-first order.
 *
 * Scans the current tnode's children from iter->index; a tnode child
 * becomes the new current node (push down), otherwise when all
 * children are done it pops to the parent and resumes just past the
 * slot this tnode occupied there.  Single-leaf tries are handled up
 * front (partially elided here).
 */
1979 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1981 struct tnode *tn = iter->tnode;
1982 unsigned cindex = iter->index;
1985 /* A single entry routing table */
1989 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1990 iter->tnode, iter->index, iter->depth);
1992 while (cindex < (1<<tn->bits)) {
1993 struct node *n = tnode_get_child(tn, cindex);
1998 iter->index = cindex + 1;
2000 /* push down one level */
2001 iter->tnode = (struct tnode *) n;
2011 /* Current node exhausted, pop back up */
2012 p = node_parent((struct node *)tn);
2014 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
/* fib_trie_get_first() - position @iter at the root of a trie and
 * return its first node.  The second parameter line (the trie 't'
 * dereferenced below) is elided in this listing.
 */
2024 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2032 n = rcu_dereference(t->trie);
2039 iter->tnode = (struct tnode *) n;
/* trie_collect_stats() - one full DFS over trie @t, accumulating into
 * @s: cumulative/max depth for leaves, a histogram of tnode sizes
 * (bits), and — in the elided inner loop body — null-child counts.
 */
2054 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2057 struct fib_trie_iter iter;
2059 memset(s, 0, sizeof(*s));
2062 for (n = fib_trie_get_first(&iter, t); n;
2063 n = fib_trie_get_next(&iter)) {
2066 s->totdepth += iter.depth;
2067 if (iter.depth > s->maxdepth)
2068 s->maxdepth = iter.depth;
2070 const struct tnode *tn = (const struct tnode *) n;
2074 if (tn->bits < MAX_STAT_DEPTH)
2075 s->nodesizes[tn->bits]++;
2077 for (i = 0; i < (1<<tn->bits); i++)
/* trie_show_stats() - render a trie_stat into /proc/net/fib_triestats:
 * average/max depth, leaf and tnode counts with byte totals, the
 * non-zero tail of the node-size histogram, pointer/null-pointer
 * counts and total size.  When CONFIG_IP_FIB_TRIE_STATS is set it also
 * dumps and then resets the runtime counters.
 *
 * NOTE(review): the stats section references 't', which does not
 * appear among this function's visible parameters — the relevant
 * declaration lines are elided in this listing; confirm against the
 * full source.
 */
2086  * This outputs /proc/net/fib_triestats
2088 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2090 unsigned i, max, pointers, bytes, avdepth;
2093 avdepth = stat->totdepth*100 / stat->leaves;
2097 seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100 );
2098 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2100 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2102 bytes = sizeof(struct leaf) * stat->leaves;
2103 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2104 bytes += sizeof(struct tnode) * stat->tnodes;
2106 max = MAX_STAT_DEPTH;
2107 while (max > 0 && stat->nodesizes[max-1] == 0)
2111 for (i = 1; i <= max; i++)
2112 if (stat->nodesizes[i] != 0) {
2113 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
2114 pointers += (1<<i) * stat->nodesizes[i];
2116 seq_putc(seq, '\n');
2117 seq_printf(seq, "\tPointers: %u\n", pointers);
2119 bytes += sizeof(struct node *) * pointers;
2120 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2121 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2123 #ifdef CONFIG_IP_FIB_TRIE_STATS
2124 seq_printf(seq, "Counters:\n---------\n");
2125 seq_printf(seq,"gets = %d\n", t->stats.gets);
2126 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2127 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2128 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2129 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2130 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2132 memset(&(t->stats), 0, sizeof(t->stats));
2134 #endif /* CONFIG_IP_FIB_TRIE_STATS */
/* fib_triestat_seq_show() - /proc/net/fib_triestats show callback.
 * Resolves the local and main tables of this netns, then collects and
 * prints stats for each using one kmalloc'ed scratch trie_stat
 * (freeing/error paths elided in this listing).
 */
2137 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2139 struct net *net = (struct net *)seq->private;
2140 struct trie *trie_local, *trie_main;
2141 struct trie_stat *stat;
2142 struct fib_table *tb;
2145 tb = fib_get_table(net, RT_TABLE_LOCAL);
2147 trie_local = (struct trie *) tb->tb_data;
2150 tb = fib_get_table(net, RT_TABLE_MAIN);
2152 trie_main = (struct trie *) tb->tb_data;
2155 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2159 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2160 sizeof(struct leaf), sizeof(struct tnode));
2163 seq_printf(seq, "Local:\n");
2164 trie_collect_stats(trie_local, stat);
2165 trie_show_stats(seq, stat);
2169 seq_printf(seq, "Main:\n");
2170 trie_collect_stats(trie_main, stat);
2171 trie_show_stats(seq, stat);
/* fib_triestat_seq_open() - open callback: grab the netns from the
 * proc inode and bind it as single_open() private data.
 */
2178 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2183 net = get_proc_net(inode);
2186 err = single_open(file, fib_triestat_seq_show, net);
/* fib_triestat_seq_release() - drop the netns reference taken at open
 * time, then let single_release() tear down the seq_file.
 */
2194 static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2196 struct seq_file *seq = f->private_data;
2197 put_net(seq->private);
2198 return single_release(ino, f);
/* File operations for /proc/net/fib_triestat (the .read line is
 * elided in this listing).
 */
2201 static const struct file_operations fib_triestat_fops = {
2202 .owner = THIS_MODULE,
2203 .open = fib_triestat_seq_open,
2205 .llseek = seq_lseek,
2206 .release = fib_triestat_seq_release,
/* fib_trie_get_idx() - linear scan to the node at a given position,
 * walking the local trie first and then the main trie.  The position
 * parameter and the idx-match checks inside each loop are elided in
 * this listing.
 */
2209 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2215 for (n = fib_trie_get_first(iter, iter->trie_local);
2216 n; ++idx, n = fib_trie_get_next(iter)) {
2221 for (n = fib_trie_get_first(iter, iter->trie_main);
2222 n; ++idx, n = fib_trie_get_next(iter)) {
/* fib_trie_seq_start() - seq_file start: lazily resolve this netns's
 * local and main tries into the iterator, then return SEQ_START_TOKEN
 * for position 0 or the node at *pos - 1.
 */
2229 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2231 struct fib_trie_iter *iter = seq->private;
2232 struct fib_table *tb;
2234 if (!iter->trie_local) {
2235 tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
2237 iter->trie_local = (struct trie *) tb->tb_data;
2239 if (!iter->trie_main) {
2240 tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
2242 iter->trie_main = (struct trie *) tb->tb_data;
2246 return SEQ_START_TOKEN;
2247 return fib_trie_get_idx(iter, *pos - 1);
/* fib_trie_seq_next() - seq_file next: advance the DFS iterator; once
 * the local trie is exhausted, continue at the first node of the main
 * trie.
 */
2250 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2252 struct fib_trie_iter *iter = seq->private;
2256 if (v == SEQ_START_TOKEN)
2257 return fib_trie_get_idx(iter, 0);
2259 v = fib_trie_get_next(iter);
2264 /* continue scan in next trie */
2265 if (iter->trie == iter->trie_local)
2266 return fib_trie_get_first(iter, iter->trie_main);
/* fib_trie_seq_stop() - seq_file stop callback; its body is elided in
 * this listing.
 */
2271 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
/* seq_indent() - emit @n indentation units of three spaces each. */
2276 static void seq_indent(struct seq_file *seq, int n)
2278 while (n-- > 0) seq_puts(seq, "   ");
/* rtn_scope() - scope enum to printable name; unknown values fall back
 * to a shared static "scope=%d" buffer (not re-entrant, fine for the
 * single-threaded seq_file readers here).
 */
2281 static inline const char *rtn_scope(enum rt_scope_t s)
2283 static char buf[32];
2286 case RT_SCOPE_UNIVERSE: return "universe";
2287 case RT_SCOPE_SITE: return "site";
2288 case RT_SCOPE_LINK: return "link";
2289 case RT_SCOPE_HOST: return "host";
2290 case RT_SCOPE_NOWHERE: return "nowhere";
2292 snprintf(buf, sizeof(buf), "scope=%d", s);
/* RTN_* route type -> printable name, indexed by type value; used by
 * rtn_type() for the /proc/net/fib_trie pretty-printer.
 */
2297 static const char *rtn_type_names[__RTN_MAX] = {
2298 [RTN_UNSPEC] = "UNSPEC",
2299 [RTN_UNICAST] = "UNICAST",
2300 [RTN_LOCAL] = "LOCAL",
2301 [RTN_BROADCAST] = "BROADCAST",
2302 [RTN_ANYCAST] = "ANYCAST",
2303 [RTN_MULTICAST] = "MULTICAST",
2304 [RTN_BLACKHOLE] = "BLACKHOLE",
2305 [RTN_UNREACHABLE] = "UNREACHABLE",
2306 [RTN_PROHIBIT] = "PROHIBIT",
2307 [RTN_THROW] = "THROW",
2309 [RTN_XRESOLVE] = "XRESOLVE",
/* rtn_type() - route type to printable name via rtn_type_names[];
 * unknown values use a shared static fallback buffer (not re-entrant).
 */
2312 static inline const char *rtn_type(unsigned t)
2314 static char buf[32];
2316 if (t < __RTN_MAX && rtn_type_names[t])
2317 return rtn_type_names[t];
2318 snprintf(buf, sizeof(buf), "type %u", t);
/* fib_trie_seq_show() - /proc/net/fib_trie show callback.
 *
 * Prints a "<local>:"/"<main>:" banner at each trie root, then renders
 * tnodes as "+-- prefix/pos bits full empty" and leaves as "|-- addr"
 * followed by one "/plen scope type [tos]" line per alias at every
 * prefix length 32..0.
 */
2322 /* Pretty print the trie */
2323 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2325 const struct fib_trie_iter *iter = seq->private;
2328 if (v == SEQ_START_TOKEN)
2331 if (!node_parent(n)) {
2332 if (iter->trie == iter->trie_local)
2333 seq_puts(seq, "<local>:\n");
2335 seq_puts(seq, "<main>:\n");
2339 struct tnode *tn = (struct tnode *) n;
2340 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2342 seq_indent(seq, iter->depth-1);
2343 seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2344 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2345 tn->empty_children);
2348 struct leaf *l = (struct leaf *) n;
2350 __be32 val = htonl(l->key);
2352 seq_indent(seq, iter->depth);
2353 seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2354 for (i = 32; i >= 0; i--) {
2355 struct leaf_info *li = find_leaf_info(l, i);
2357 struct fib_alias *fa;
2358 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2359 seq_indent(seq, iter->depth+1);
2360 seq_printf(seq, "  /%d %s %s", i,
2361 rtn_scope(fa->fa_scope),
2362 rtn_type(fa->fa_type));
2364 seq_printf(seq, "tos =%d\n",
2366 seq_putc(seq, '\n');
/* seq_file operations behind /proc/net/fib_trie. */
2375 static const struct seq_operations fib_trie_seq_ops = {
2376 .start = fib_trie_seq_start,
2377 .next = fib_trie_seq_next,
2378 .stop = fib_trie_seq_stop,
2379 .show = fib_trie_seq_show,
/* fib_trie_seq_open() - open /proc/net/fib_trie with a per-namespace
 * fib_trie_iter as seq_file private state.
 */
2382 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2384 return seq_open_net(inode, file, &fib_trie_seq_ops,
2385 sizeof(struct fib_trie_iter));
/* File operations for /proc/net/fib_trie (the .read line is elided in
 * this listing).
 */
2388 static const struct file_operations fib_trie_fops = {
2389 .owner = THIS_MODULE,
2390 .open = fib_trie_seq_open,
2392 .llseek = seq_lseek,
2393 .release = seq_release_net,
/* fib_flag_trans() - map a route to RTF_* flags for /proc/net/route.
 * Types 7 and 8 (RTN_UNREACHABLE/RTN_PROHIBIT per the enum order shown
 * in rtn_type_names) report RTF_REJECT; gatewayed nexthops add
 * RTF_GATEWAY.  The full-mask branch body is elided in this listing —
 * presumably it adds RTF_HOST; confirm against the full source.
 */
2396 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2398 static unsigned type2flags[RTN_MAX + 1] = {
2399 [7] = RTF_REJECT, [8] = RTF_REJECT,
2401 unsigned flags = type2flags[type];
2403 if (fi && fi->fib_nh->nh_gw)
2404 flags |= RTF_GATEWAY;
2405 if (mask == htonl(0xFFFFFFFF))
/* fib_route_seq_show() - /proc/net/route show callback.
 *
 * Emits the legacy header at SEQ_START_TOKEN, skips the local trie
 * (only main-table routes appear in /proc/net/route), then for every
 * prefix length of the leaf prints one fib_hash-compatible line per
 * alias, skipping broadcast/multicast entries.  Output format must not
 * change (see the comment below).
 */
2412  * This outputs /proc/net/route.
2413  * The format of the file is not supposed to be changed
2414  * and needs to be same as fib_hash output to avoid breaking
2417 static int fib_route_seq_show(struct seq_file *seq, void *v)
2419 const struct fib_trie_iter *iter = seq->private;
2424 if (v == SEQ_START_TOKEN) {
2425 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2426 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2431 if (iter->trie == iter->trie_local)
2436 for (i=32; i>=0; i--) {
2437 struct leaf_info *li = find_leaf_info(l, i);
2438 struct fib_alias *fa;
2439 __be32 mask, prefix;
2444 mask = inet_make_mask(li->plen);
2445 prefix = htonl(l->key);
2447 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2448 const struct fib_info *fi = fa->fa_info;
2449 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2451 if (fa->fa_type == RTN_BROADCAST
2452 || fa->fa_type == RTN_MULTICAST)
2456 snprintf(bf, sizeof(bf),
2457 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2458 fi->fib_dev ? fi->fib_dev->name : "*",
2460 fi->fib_nh->nh_gw, flags, 0, 0,
2463 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2467 snprintf(bf, sizeof(bf),
2468 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2469 prefix, 0, flags, 0, 0, 0,
2472 seq_printf(seq, "%-127s\n", bf);
/* seq_file operations for /proc/net/route; reuses the fib_trie
 * iterator with a route-format show callback.
 */
2479 static const struct seq_operations fib_route_seq_ops = {
2480 .start = fib_trie_seq_start,
2481 .next = fib_trie_seq_next,
2482 .stop = fib_trie_seq_stop,
2483 .show = fib_route_seq_show,
/* fib_route_seq_open() - open /proc/net/route with a per-namespace
 * fib_trie_iter as seq_file private state.
 */
2486 static int fib_route_seq_open(struct inode *inode, struct file *file)
2488 return seq_open_net(inode, file, &fib_route_seq_ops,
2489 sizeof(struct fib_trie_iter));
/* File operations for /proc/net/route (the .read line is elided in
 * this listing).
 */
2492 static const struct file_operations fib_route_fops = {
2493 .owner = THIS_MODULE,
2494 .open = fib_route_seq_open,
2496 .llseek = seq_lseek,
2497 .release = seq_release_net,
/* fib_proc_init() - create the three per-namespace proc entries
 * (fib_trie, fib_triestat, route); on failure the already-created
 * entries are removed again (the error labels are elided in this
 * listing).
 */
2500 int __net_init fib_proc_init(struct net *net)
2502 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2505 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2506 &fib_triestat_fops))
2509 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2515 proc_net_remove(net, "fib_triestat");
2517 proc_net_remove(net, "fib_trie");
/* fib_proc_exit() - remove all three proc entries for this namespace,
 * mirroring fib_proc_init().
 */
2522 void __net_exit fib_proc_exit(struct net *net)
2524 proc_net_remove(net, "fib_trie");
2525 proc_net_remove(net, "fib_triestat");
2526 proc_net_remove(net, "route");
2529 #endif /* CONFIG_PROC_FS */