2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
64 /* Set to 3 to get tracing. */
/* RT6_TRACE expands to a KERN_DEBUG printk when tracing is compiled in,
 * and to a no-op statement otherwise (both arms visible below; the
 * surrounding #if/#else lines are outside this fragment). */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
/* Compile-time switch: 0 means off-link routes are never cloned in the
 * slow paths (see the #if CLONE_OFFLINK_ROUTE uses in ip6_pol_route_*). */
75 #define CLONE_OFFLINK_ROUTE 0
/* Strictness flags passed to rt6_select()/rt6_score_route():
 * _IFACE      - the route's device must match the given oif;
 * _REACHABLE  - prefer routers whose neighbour entry looks reachable. */
77 #define RT6_SELECT_F_IFACE 0x1
78 #define RT6_SELECT_F_REACHABLE 0x2
/* Tunables for the IPv6 route cache garbage collector and PMTU handling.
 * Intervals/timeouts are in jiffies; ip6_rt_min_advmss subtracts the
 * TCP (20) and IPv6 (40) header sizes from the minimum link MTU. */
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(void);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct sk_buff *skb);
98 static void ip6_link_failure(struct sk_buff *skb);
99 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 struct in6_addr *gwaddr, int ifindex,
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 struct in6_addr *gwaddr, int ifindex);
/* dst_ops vtable for IPv6 routes: wires the cache hooks implemented in
 * this file (check/destroy/ifdown/advice/failure/pmtu) into the generic
 * dst cache.  entry_size embeds the dst_entry inside rt6_info. */
109 static struct dst_ops ip6_dst_ops = {
111 .protocol = __constant_htons(ETH_P_IPV6),
114 .check = ip6_dst_check,
115 .destroy = ip6_dst_destroy,
116 .ifdown = ip6_dst_ifdown,
117 .negative_advice = ip6_negative_advice,
118 .link_failure = ip6_link_failure,
119 .update_pmtu = ip6_rt_update_pmtu,
120 .entry_size = sizeof(struct rt6_info),
/* Sentinel "no route" entry returned by the lookup paths on a miss:
 * a permanent reject route (-ENETUNREACH) bound to the loopback device
 * that discards every packet in both directions. */
123 struct rt6_info ip6_null_entry = {
126 .__refcnt = ATOMIC_INIT(1),
128 .dev = &loopback_dev,
130 .error = -ENETUNREACH,
131 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
132 .input = ip6_pkt_discard,
133 .output = ip6_pkt_discard_out,
135 .path = (struct dst_entry*)&ip6_null_entry,
/* Worst possible metric so a real route always wins over the sentinel. */
138 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
139 .rt6i_metric = ~(u32) 0,
140 .rt6i_ref = ATOMIC_INIT(1),
143 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Policy-routing sentinel used when a rule says "prohibit": same shape
 * as ip6_null_entry.  NOTE(review): the .error initializer is not
 * visible in this fragment — presumably -EACCES; confirm upstream. */
145 struct rt6_info ip6_prohibit_entry = {
148 .__refcnt = ATOMIC_INIT(1),
150 .dev = &loopback_dev,
153 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
154 .input = ip6_pkt_discard,
155 .output = ip6_pkt_discard_out,
157 .path = (struct dst_entry*)&ip6_prohibit_entry,
160 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
161 .rt6i_metric = ~(u32) 0,
162 .rt6i_ref = ATOMIC_INIT(1),
/* Policy-routing sentinel for "blackhole" rules: silently discards
 * traffic.  Visible fields mirror ip6_null_entry; the error/obsolete
 * initializers fall outside this fragment. */
165 struct rt6_info ip6_blk_hole_entry = {
168 .__refcnt = ATOMIC_INIT(1),
170 .dev = &loopback_dev,
173 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
174 .input = ip6_pkt_discard,
175 .output = ip6_pkt_discard_out,
177 .path = (struct dst_entry*)&ip6_blk_hole_entry,
180 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
181 .rt6i_metric = ~(u32) 0,
182 .rt6i_ref = ATOMIC_INIT(1),
187 /* allocate dst with ip6_dst_ops */
/* Thin wrapper: every rt6_info is a dst_entry drawn from the
 * ip6_dst_ops pool, cast to the embedding structure. */
188 static __inline__ struct rt6_info *ip6_dst_alloc(void)
190 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
/* dst_ops.destroy hook: detach the route from its inet6_dev.
 * NOTE(review): the matching in6_dev_put(idev) is not visible in this
 * fragment — presumably it follows the NULL assignment; confirm. */
193 static void ip6_dst_destroy(struct dst_entry *dst)
195 struct rt6_info *rt = (struct rt6_info *)dst;
196 struct inet6_dev *idev = rt->rt6i_idev;
/* Clear the back-pointer before the device reference is dropped. */
199 rt->rt6i_idev = NULL;
/* dst_ops.ifdown hook: when @dev goes away, re-point routes that
 * reference its inet6_dev at the loopback device's inet6_dev so the
 * cached dst remains usable until it is garbage collected. */
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207 struct rt6_info *rt = (struct rt6_info *)dst;
208 struct inet6_dev *idev = rt->rt6i_idev;
/* Only swap when the dying device is the one this route holds. */
210 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
211 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
212 if (loopback_idev != NULL) {
213 rt->rt6i_idev = loopback_idev;
/* True when the route carries RTF_EXPIRES and its deadline
 * (rt6i_expires, in jiffies) has already passed. */
219 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
221 return (rt->rt6i_flags & RTF_EXPIRES &&
222 time_after(jiffies, rt->rt6i_expires));
/* Multicast and link-local destinations are scoped to one interface,
 * so lookups for them must strictly honour the given ifindex. */
225 static inline int rt6_need_strict(struct in6_addr *daddr)
227 return (ipv6_addr_type(daddr) &
228 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
232 * Route lookup. Any table->tb6_lock is implied.
/* Walk the route list at one fib6 node (linked via u.next) and pick the
 * entry whose device matches @oif.  A loopback route matches indirectly
 * when its inet6_dev points at the requested interface ("local" case).
 * Falls back to &ip6_null_entry when nothing matches (visible at the
 * end of the fragment); the non-strict fallback path is not visible. */
235 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
239 struct rt6_info *local = NULL;
240 struct rt6_info *sprt;
243 for (sprt = rt; sprt; sprt = sprt->u.next) {
244 struct net_device *dev = sprt->rt6i_dev;
245 if (dev->ifindex == oif)
247 if (dev->flags & IFF_LOOPBACK) {
248 if (sprt->rt6i_idev == NULL ||
249 sprt->rt6i_idev->dev->ifindex != oif) {
252 if (local && (!oif ||
253 local->rt6i_idev->dev->ifindex == oif))
264 return &ip6_null_entry;
269 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Router Reachability Probing: if the route's next-hop neighbour is not
 * in a VALID NUD state and has not been updated within
 * rtr_probe_interval, send a unicast-solicit NS to its solicited-node
 * multicast address.  Touching neigh->updated first rate-limits
 * concurrent probes for the same neighbour. */
270 static void rt6_probe(struct rt6_info *rt)
272 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
274 * Okay, this does not seem to be appropriate
275 * for now, however, we need to check if it
276 * is really so; aka Router Reachability Probing.
278 * Router Reachability Probe MUST be rate-limited
279 * to no more than one per minute.
281 if (!neigh || (neigh->nud_state & NUD_VALID))
283 read_lock_bh(&neigh->lock);
284 if (!(neigh->nud_state & NUD_VALID) &&
285 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
286 struct in6_addr mcaddr;
287 struct in6_addr *target;
289 neigh->updated = jiffies;
/* Drop the lock before transmitting the neighbour solicitation. */
290 read_unlock_bh(&neigh->lock);
292 target = (struct in6_addr *)&neigh->primary_key;
293 addrconf_addr_solict_mult(target, &mcaddr);
294 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
296 read_unlock_bh(&neigh->lock);
/* Stub when router-preference support is compiled out. */
299 static inline void rt6_probe(struct rt6_info *rt)
306 * Default Router Selection (RFC 2461 6.3.6)
/* Device-match score for router selection: a route matches when no oif
 * is requested or its device (directly, or via its inet6_dev for
 * loopback routes) is the requested interface.  Exact return values
 * are outside this fragment. */
308 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
310 struct net_device *dev = rt->rt6i_dev;
311 if (!oif || dev->ifindex == oif)
313 if ((dev->flags & IFF_LOOPBACK) &&
314 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Reachability score: routes with no next hop (RTF_NONEXTHOP, or not
 * RTF_GATEWAY) are trivially usable; otherwise consult the neighbour
 * entry's NUD state under its read lock. */
319 static int inline rt6_check_neigh(struct rt6_info *rt)
321 struct neighbour *neigh = rt->rt6i_nexthop;
323 if (rt->rt6i_flags & RTF_NONEXTHOP ||
324 !(rt->rt6i_flags & RTF_GATEWAY))
327 read_lock_bh(&neigh->lock);
328 if (neigh->nud_state & NUD_VALID)
330 read_unlock_bh(&neigh->lock);
/* Combined score for router selection: device match, then (with router
 * preference support) the RFC 4191 preference bits shifted above it,
 * then neighbour reachability.  The strict flags turn a failed device
 * or reachability check into a rejection (return path not visible). */
335 static int rt6_score_route(struct rt6_info *rt, int oif,
340 m = rt6_check_dev(rt, oif);
341 if (!m && (strict & RT6_SELECT_F_IFACE))
343 #ifdef CONFIG_IPV6_ROUTER_PREF
344 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
346 n = rt6_check_neigh(rt);
349 else if (!n && strict & RT6_SELECT_F_REACHABLE)
/* Default router selection (RFC 2461 6.3.6): scan the routes sharing
 * the head entry's metric, score each with rt6_score_route(), and keep
 * the best.  If nothing reachable matched, rotate the list under a
 * local spinlock (round-robin among equally-good routers).  Returns
 * the match or &ip6_null_entry. */
354 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
357 struct rt6_info *match = NULL, *last = NULL;
358 struct rt6_info *rt, *rt0 = *head;
362 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
363 __FUNCTION__, head, head ? *head : NULL, oif);
/* Only candidates with the same (best) metric as rt0 are considered. */
365 for (rt = rt0, metric = rt0->rt6i_metric;
366 rt && rt->rt6i_metric == metric && (!last || rt != rt0);
370 if (rt6_check_expired(rt))
375 m = rt6_score_route(rt, oif, strict);
389 (strict & RT6_SELECT_F_REACHABLE) &&
390 last && last != rt0) {
391 /* no entries matched; do round-robin */
392 static DEFINE_SPINLOCK(lock);
395 rt0->u.next = last->u.next;
400 RT6_TRACE("%s() => %p, score=%d\n",
401 __FUNCTION__, match, mpri);
403 return (match ? match : &ip6_null_entry);
406 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information Option received in a Router
 * Advertisement on @dev from router @gwaddr: validate the option,
 * then add, refresh, or (on zero lifetime) remove the corresponding
 * RTF_ROUTEINFO route. */
407 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
408 struct in6_addr *gwaddr)
410 struct route_info *rinfo = (struct route_info *) opt;
411 struct in6_addr prefix_buf, *prefix;
416 if (len < sizeof(struct route_info)) {
420 /* Sanity check for prefix_len and length */
/* The option length field counts 8-octet units: 1..3 depending on how
 * much of the 128-bit prefix is carried. */
421 if (rinfo->length > 3) {
423 } else if (rinfo->prefix_len > 128) {
425 } else if (rinfo->prefix_len > 64) {
426 if (rinfo->length < 2) {
429 } else if (rinfo->prefix_len > 0) {
430 if (rinfo->length < 1) {
435 pref = rinfo->route_pref;
436 if (pref == ICMPV6_ROUTER_PREF_INVALID)
437 pref = ICMPV6_ROUTER_PREF_MEDIUM;
/* NOTE(review): byte-swapping a wire value with htonl(); ntohl() is
 * the conventional direction here (same result on all arches, but
 * confirm intent against the upstream source). */
439 lifetime = htonl(rinfo->lifetime);
440 if (lifetime == 0xffffffff) {
442 } else if (lifetime > 0x7fffffff/HZ) {
443 /* Avoid arithmetic overflow */
444 lifetime = 0x7fffffff/HZ - 1;
/* A full-length option carries the whole prefix; shorter options are
 * zero-extended into a local buffer. */
447 if (rinfo->length == 3)
448 prefix = (struct in6_addr *)rinfo->prefix;
450 /* this function is safe */
451 ipv6_addr_prefix(&prefix_buf,
452 (struct in6_addr *)rinfo->prefix,
454 prefix = &prefix_buf;
457 rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
/* Zero lifetime: withdraw an existing route (deletion path not
 * visible in this fragment). */
459 if (rt && !lifetime) {
465 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
468 rt->rt6i_flags = RTF_ROUTEINFO |
469 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* Infinite lifetime -> permanent; otherwise arm RTF_EXPIRES. */
472 if (lifetime == 0xffffffff) {
473 rt->rt6i_flags &= ~RTF_EXPIRES;
475 rt->rt6i_expires = jiffies + HZ * lifetime;
476 rt->rt6i_flags |= RTF_EXPIRES;
478 dst_release(&rt->u.dst);
/* Lookup backtracking helper shared by the ip6_pol_route_* functions:
 * when the current fib6 node yielded no route (rt == &ip6_null_entry),
 * climb toward the tree root, re-descending into any source-address
 * subtree, until a node with real route info (RTN_RTINFO) is found or
 * the top-level root (RTN_TL_ROOT) is reached. */
484 #define BACKTRACK(saddr) \
486 if (rt == &ip6_null_entry) { \
487 struct fib6_node *pn; \
489 if (fn->fn_flags & RTN_TL_ROOT) \
492 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
493 fn = fib6_lookup(pn->subtree, NULL, saddr); \
496 if (fn->fn_flags & RTN_RTINFO) \
/* Fast, non-cloning lookup in one table (used via fib6_rule_lookup
 * from rt6_lookup): find the best node for fl6_dst/fl6_src, match on
 * the output interface, backtrack on a miss, and return the route with
 * an extra dst reference. */
502 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
503 struct flowi *fl, int flags)
505 struct fib6_node *fn;
508 read_lock_bh(&table->tb6_lock);
509 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
512 rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
513 BACKTRACK(&fl->fl6_src);
/* Take the reference while still under the table lock. */
514 dst_hold(&rt->u.dst);
516 read_unlock_bh(&table->tb6_lock);
518 rt->u.dst.lastuse = jiffies;
/* Public lookup entry point: build a flow (construction lines not
 * visible here) and resolve it through the policy-routing rules with
 * ip6_pol_route_lookup.  @strict maps to RT6_F_STRICT interface
 * matching.  Caller receives a referenced rt6_info (or sentinel). */
525 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
537 struct dst_entry *dst;
538 int flags = strict ? RT6_F_STRICT : 0;
540 dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
542 return (struct rt6_info *) dst;
549 /* ip6_ins_rt is called with FREE table->tb6_lock.
550 It takes new route entry, the addition fails by any reason the
551 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its own table (rt->rt6i_table) under the table's
 * write lock; @info carries netlink notification context.  Returns
 * fib6_add()'s result. */
555 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
558 struct fib6_table *table;
560 table = rt->rt6i_table;
561 write_lock_bh(&table->tb6_lock);
562 err = fib6_add(&table->tb6_root, rt, info);
563 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper: insert without netlink notification info. */
568 int ip6_ins_rt(struct rt6_info *rt)
570 return __ip6_ins_rt(rt, NULL);
/* Copy-on-write clone of @ort for one destination: produce a host
 * (/128) RTF_CACHE route for @daddr and resolve its next-hop neighbour.
 * For non-gateway (on-link) routes the destination itself becomes the
 * gateway, and a destination equal to the route's own prefix address
 * is marked anycast. */
573 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
574 struct in6_addr *saddr)
582 rt = ip6_rt_copy(ort);
585 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
586 if (rt->rt6i_dst.plen != 128 &&
587 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
588 rt->rt6i_flags |= RTF_ANYCAST;
589 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
592 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
593 rt->rt6i_dst.plen = 128;
594 rt->rt6i_flags |= RTF_CACHE;
595 rt->u.dst.flags |= DST_HOST;
597 #ifdef CONFIG_IPV6_SUBTREES
/* With subtrees, also pin the clone to the specific source address. */
598 if (rt->rt6i_src.plen && saddr) {
599 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
600 rt->rt6i_src.plen = 128;
604 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Plain clone of @ort pinned to @daddr (/128 cache entry): unlike
 * rt6_alloc_cow() it keeps the original gateway, reuses the original
 * neighbour via neigh_clone(), and propagates the reject error code. */
611 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
613 struct rt6_info *rt = ip6_rt_copy(ort);
615 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
616 rt->rt6i_dst.plen = 128;
617 rt->rt6i_flags |= RTF_CACHE;
618 if (rt->rt6i_flags & RTF_REJECT)
619 rt->u.dst.error = ort->u.dst.error;
620 rt->u.dst.flags |= DST_HOST;
621 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
/* Per-table slow-path lookup for received packets: select the best
 * route (preferring reachable routers), then — unless it is already a
 * cache entry — create a per-destination clone (COW for on-link
 * routes; off-link cloning is compiled out via CLONE_OFFLINK_ROUTE)
 * and insert it, re-looking-up on an insertion race.  Matches on the
 * flow's input interface (fl->iif). */
626 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
627 struct flowi *fl, int flags)
629 struct fib6_node *fn;
630 struct rt6_info *rt, *nrt;
634 int reachable = RT6_SELECT_F_REACHABLE;
636 if (flags & RT6_F_STRICT)
637 strict = RT6_SELECT_F_IFACE;
640 read_lock_bh(&table->tb6_lock);
643 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
646 rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
647 BACKTRACK(&fl->fl6_src);
/* Sentinel or existing cache entry: return it as-is. */
648 if (rt == &ip6_null_entry ||
649 rt->rt6i_flags & RTF_CACHE)
652 dst_hold(&rt->u.dst);
653 read_unlock_bh(&table->tb6_lock);
/* On-link route without a resolved next hop: clone copy-on-write. */
655 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
656 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
658 #if CLONE_OFFLINK_ROUTE
659 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
665 dst_release(&rt->u.dst);
666 rt = nrt ? : &ip6_null_entry;
668 dst_hold(&rt->u.dst);
670 err = ip6_ins_rt(nrt);
679 * Race condition! In the gap, when table->tb6_lock was
680 * released someone could insert this route. Relookup.
682 dst_release(&rt->u.dst);
690 dst_hold(&rt->u.dst);
691 read_unlock_bh(&table->tb6_lock);
693 rt->u.dst.lastuse = jiffies;
/* Route an incoming packet: build a flow from its IPv6 header
 * (addresses/label/protocol and the receiving ifindex), request strict
 * interface matching for scoped destinations, and attach the resolved
 * dst to the skb. */
699 void ip6_route_input(struct sk_buff *skb)
701 struct ipv6hdr *iph = skb->nh.ipv6h;
703 .iif = skb->dev->ifindex,
708 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
711 .proto = iph->nexthdr,
715 if (rt6_need_strict(&iph->daddr))
716 flags |= RT6_F_STRICT;
718 skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
/* Per-table slow-path lookup for locally generated packets: identical
 * structure to ip6_pol_route_input() but matches on the flow's output
 * interface (fl->oif).  Selects a (preferably reachable) route, clones
 * it into a /128 cache entry when needed, and handles the insertion
 * race by re-looking-up. */
721 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
722 struct flowi *fl, int flags)
724 struct fib6_node *fn;
725 struct rt6_info *rt, *nrt;
729 int reachable = RT6_SELECT_F_REACHABLE;
731 if (flags & RT6_F_STRICT)
732 strict = RT6_SELECT_F_IFACE;
735 read_lock_bh(&table->tb6_lock);
738 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
741 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
742 BACKTRACK(&fl->fl6_src);
743 if (rt == &ip6_null_entry ||
744 rt->rt6i_flags & RTF_CACHE)
747 dst_hold(&rt->u.dst);
748 read_unlock_bh(&table->tb6_lock);
750 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
751 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
753 #if CLONE_OFFLINK_ROUTE
754 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
760 dst_release(&rt->u.dst);
761 rt = nrt ? : &ip6_null_entry;
763 dst_hold(&rt->u.dst);
765 err = ip6_ins_rt(nrt);
774 * Race condition! In the gap, when table->tb6_lock was
775 * released someone could insert this route. Relookup.
777 dst_release(&rt->u.dst);
785 dst_hold(&rt->u.dst);
786 read_unlock_bh(&table->tb6_lock);
788 rt->u.dst.lastuse = jiffies;
/* Resolve a dst for an outgoing flow, with strict interface matching
 * for multicast/link-local destinations. */
793 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
797 if (rt6_need_strict(&fl->fl6_dst))
798 flags |= RT6_F_STRICT;
800 return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
805 * Destination cache support functions
/* dst_ops.check hook: a cached route stays valid while its fib6 node's
 * serial number still matches the cookie taken at cache time. */
808 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
812 rt = (struct rt6_info *) dst;
814 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
/* dst_ops.negative_advice hook: for RTF_CACHE entries, the caller is
 * advised to drop the dst (the action taken on the cache entry falls
 * outside this fragment — presumably ip6_del_rt; confirm upstream). */
820 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
822 struct rt6_info *rt = (struct rt6_info *) dst;
825 if (rt->rt6i_flags & RTF_CACHE)
/* dst_ops.link_failure hook: report address-unreachable to the sender,
 * then invalidate the route — cache entries are expired immediately,
 * default routes force a tree-wide relookup by poisoning fn_sernum. */
833 static void ip6_link_failure(struct sk_buff *skb)
837 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
839 rt = (struct rt6_info *) skb->dst;
841 if (rt->rt6i_flags&RTF_CACHE) {
842 dst_set_expires(&rt->u.dst, 0);
843 rt->rt6i_flags |= RTF_EXPIRES;
844 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
845 rt->rt6i_node->fn_sernum = -1;
/* dst_ops.update_pmtu hook: lower the MTU metric on a host (/128)
 * route only.  An MTU below IPV6_MIN_MTU sets RTAX_FEATURE_ALLFRAG
 * (always include a fragment header) instead of going below the
 * minimum link MTU; interested subsystems are notified via netevent. */
849 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
851 struct rt6_info *rt6 = (struct rt6_info*)dst;
853 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
854 rt6->rt6i_flags |= RTF_MODIFIED;
855 if (mtu < IPV6_MIN_MTU) {
857 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
859 dst->metrics[RTAX_MTU-1] = mtu;
860 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
864 static int ipv6_get_mtu(struct net_device *dev);
/* Advertised MSS derived from a path MTU: subtract the fixed IPv6 and
 * TCP header sizes, clamp below to the sysctl minimum, and cap at
 * IPV6_MAXPLEN (meaning "no limit; rely on PMTU discovery"). */
866 static inline unsigned int ipv6_advmss(unsigned int mtu)
868 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
870 if (mtu < ip6_rt_min_advmss)
871 mtu = ip6_rt_min_advmss;
874 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
875 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
876 * IPV6_MAXPLEN is also valid and means: "any MSS,
877 * rely only on pmtu discovery"
879 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* Singly linked list of dsts handed out to neighbour discovery,
 * reaped by ndisc_dst_gc(); ndisc_lock guards the list. */
884 static struct dst_entry *ndisc_dst_gc_list;
885 static DEFINE_SPINLOCK(ndisc_lock);
/* Build a standalone host dst for an ndisc reply to @addr on @dev:
 * not inserted into any fib table — instead chained on the private gc
 * list above.  @neigh may be pre-resolved; otherwise it is looked up
 * here.  @output supplies the transmit hook. */
887 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
888 struct neighbour *neigh,
889 struct in6_addr *addr,
890 int (*output)(struct sk_buff *))
893 struct inet6_dev *idev = in6_dev_get(dev);
895 if (unlikely(idev == NULL))
898 rt = ip6_dst_alloc();
899 if (unlikely(rt == NULL)) {
908 neigh = ndisc_get_neigh(dev, addr);
911 rt->rt6i_idev = idev;
912 rt->rt6i_nexthop = neigh;
913 atomic_set(&rt->u.dst.__refcnt, 1);
914 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
915 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
916 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
917 rt->u.dst.output = output;
919 #if 0 /* there's no chance to use these for ndisc */
920 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
923 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
924 rt->rt6i_dst.plen = 128;
/* Enqueue on the gc list and make sure the fib gc timer is running. */
927 spin_lock_bh(&ndisc_lock);
928 rt->u.dst.next = ndisc_dst_gc_list;
929 ndisc_dst_gc_list = &rt->u.dst;
930 spin_unlock_bh(&ndisc_lock);
932 fib6_force_start_gc();
935 return (struct dst_entry *)rt;
/* Reap unreferenced entries from ndisc_dst_gc_list.  @more is an
 * output counter (its updates are outside this fragment — presumably
 * the number of entries still pending). */
938 int ndisc_dst_gc(int *more)
940 struct dst_entry *dst, *next, **pprev;
946 spin_lock_bh(&ndisc_lock);
947 pprev = &ndisc_dst_gc_list;
949 while ((dst = *pprev) != NULL) {
950 if (!atomic_read(&dst->__refcnt)) {
960 spin_unlock_bh(&ndisc_lock);
/* dst_ops.gc hook: rate-limited garbage collection of the route cache.
 * Skips the run when invoked within gc_min_interval while under the
 * size cap; the adaptive 'expire' age shrinks geometrically (by
 * 1/2^elasticity) each pass while pressure persists, and resets once
 * the cache drops below gc_thresh.  Returns nonzero when the cache is
 * still over ip6_rt_max_size (allocation should fail). */
965 static int ip6_dst_gc(void)
967 static unsigned expire = 30*HZ;
968 static unsigned long last_gc;
969 unsigned long now = jiffies;
971 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
972 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
978 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
979 expire = ip6_rt_gc_timeout>>1;
982 expire -= expire>>ip6_rt_gc_elasticity;
983 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
986 /* Clean host part of a prefix. Not necessary in radix tree,
987 but results in cleaner routing tables.
989 Remove it only when all the things will work!
/* Device MTU for IPv6: the per-device conf value when an inet6_dev
 * exists, else the protocol minimum (IPV6_MIN_MTU). */
992 static int ipv6_get_mtu(struct net_device *dev)
994 int mtu = IPV6_MIN_MTU;
995 struct inet6_dev *idev;
997 idev = in6_dev_get(dev);
999 mtu = idev->cnf.mtu6;
/* Hop limit for @dev: the per-device conf value when available,
 * falling back to the global ipv6_devconf default. */
1005 int ipv6_get_hoplimit(struct net_device *dev)
1007 int hoplimit = ipv6_devconf.hop_limit;
1008 struct inet6_dev *idev;
1010 idev = in6_dev_get(dev);
1012 hoplimit = idev->cnf.hop_limit;
/* Create and insert the route described by @cfg into its fib6 table.
 * Returns 0 on success or a negative errno; on failure the partially
 * built entry is released (dst_free at the end of the fragment). */
1022 int ip6_route_add(struct fib6_config *cfg)
1025 struct rt6_info *rt = NULL;
1026 struct net_device *dev = NULL;
1027 struct inet6_dev *idev = NULL;
1028 struct fib6_table *table;
/* Prefix lengths beyond 128 bits are invalid for IPv6. */
1031 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1033 #ifndef CONFIG_IPV6_SUBTREES
/* Source-prefix routes require subtree support. */
1034 if (cfg->fc_src_len)
1037 if (cfg->fc_ifindex) {
1039 dev = dev_get_by_index(cfg->fc_ifindex);
1042 idev = in6_dev_get(dev);
/* Metric 0 means "unspecified": use the user default priority. */
1047 if (cfg->fc_metric == 0)
1048 cfg->fc_metric = IP6_RT_PRIO_USER;
1050 table = fib6_new_table(cfg->fc_table);
1051 if (table == NULL) {
1056 rt = ip6_dst_alloc();
1063 rt->u.dst.obsolete = -1;
1064 rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1066 if (cfg->fc_protocol == RTPROT_UNSPEC)
1067 cfg->fc_protocol = RTPROT_BOOT;
1068 rt->rt6i_protocol = cfg->fc_protocol;
1070 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Multicast destinations take the multicast input path. */
1072 if (addr_type & IPV6_ADDR_MULTICAST)
1073 rt->u.dst.input = ip6_mc_input;
1075 rt->u.dst.input = ip6_forward;
1077 rt->u.dst.output = ip6_output;
1079 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1080 rt->rt6i_dst.plen = cfg->fc_dst_len;
1081 if (rt->rt6i_dst.plen == 128)
1082 rt->u.dst.flags = DST_HOST;
1084 #ifdef CONFIG_IPV6_SUBTREES
1085 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1086 rt->rt6i_src.plen = cfg->fc_src_len;
1089 rt->rt6i_metric = cfg->fc_metric;
1091 /* We cannot add true routes via loopback here,
1092 they would result in kernel looping; promote them to reject routes
1094 if ((cfg->fc_flags & RTF_REJECT) ||
1095 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1096 /* hold loopback dev/idev if we haven't done so. */
1097 if (dev != &loopback_dev) {
1102 dev = &loopback_dev;
1104 idev = in6_dev_get(dev);
1110 rt->u.dst.output = ip6_pkt_discard_out;
1111 rt->u.dst.input = ip6_pkt_discard;
1112 rt->u.dst.error = -ENETUNREACH;
1113 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1117 if (cfg->fc_flags & RTF_GATEWAY) {
1118 struct in6_addr *gw_addr;
1121 gw_addr = &cfg->fc_gateway;
1122 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1123 gwa_type = ipv6_addr_type(gw_addr);
1125 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1126 struct rt6_info *grt;
1128 /* IPv6 strictly inhibits using not link-local
1129 addresses as nexthop address.
1130 Otherwise, router will not able to send redirects.
1131 It is very good, but in some (rare!) circumstances
1132 (SIT, PtP, NBMA NOARP links) it is handy to allow
1133 some exceptions. --ANK
1136 if (!(gwa_type&IPV6_ADDR_UNICAST))
/* A non-link-local gateway must itself be reachable through a
 * non-gateway (on-link) route; inherit its device/idev. */
1139 grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1141 err = -EHOSTUNREACH;
1145 if (dev != grt->rt6i_dev) {
1146 dst_release(&grt->u.dst);
1150 dev = grt->rt6i_dev;
1151 idev = grt->rt6i_idev;
1153 in6_dev_hold(grt->rt6i_idev);
1155 if (!(grt->rt6i_flags&RTF_GATEWAY))
1157 dst_release(&grt->u.dst);
1163 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
/* Gateway/NONEXTHOP routes need a resolved neighbour entry now. */
1171 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1172 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1173 if (IS_ERR(rt->rt6i_nexthop)) {
1174 err = PTR_ERR(rt->rt6i_nexthop);
1175 rt->rt6i_nexthop = NULL;
1180 rt->rt6i_flags = cfg->fc_flags;
/* Copy user-supplied netlink RTAX_* metric attributes. */
1187 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1188 int type = nla->nla_type;
1191 if (type > RTAX_MAX) {
1196 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
/* Fill defaults for any metrics the caller left unset. */
1201 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1202 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1203 if (!rt->u.dst.metrics[RTAX_MTU-1])
1204 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1205 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1206 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1207 rt->u.dst.dev = dev;
1208 rt->rt6i_idev = idev;
1209 rt->rt6i_table = table;
1210 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/* Error path: free the unfinished route entry. */
1218 dst_free((struct dst_entry *) rt);
/* Remove @rt from its table under the write lock and drop the caller's
 * reference.  The sentinel entry can never be deleted. */
1222 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1225 struct fib6_table *table;
1227 if (rt == &ip6_null_entry)
1230 table = rt->rt6i_table;
1231 write_lock_bh(&table->tb6_lock);
1233 err = fib6_del(rt, info);
1234 dst_release(&rt->u.dst);
1236 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper: delete without netlink notification info. */
1241 int ip6_del_rt(struct rt6_info *rt)
1243 return __ip6_del_rt(rt, NULL);
/* Delete the route matching @cfg: locate the exact fib6 node for the
 * dst/src prefixes, then scan its route list for an entry whose
 * device, gateway, and metric all agree with the request (each
 * criterion is only checked when the caller specified it).  The match
 * is referenced before the table lock is dropped, then deleted. */
1246 static int ip6_route_del(struct fib6_config *cfg)
1248 struct fib6_table *table;
1249 struct fib6_node *fn;
1250 struct rt6_info *rt;
1253 table = fib6_get_table(cfg->fc_table);
1257 read_lock_bh(&table->tb6_lock);
1259 fn = fib6_locate(&table->tb6_root,
1260 &cfg->fc_dst, cfg->fc_dst_len,
1261 &cfg->fc_src, cfg->fc_src_len);
1264 for (rt = fn->leaf; rt; rt = rt->u.next) {
1265 if (cfg->fc_ifindex &&
1266 (rt->rt6i_dev == NULL ||
1267 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1269 if (cfg->fc_flags & RTF_GATEWAY &&
1270 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1272 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Hold the route so it survives the unlock, then delete it. */
1274 dst_hold(&rt->u.dst);
1275 read_unlock_bh(&table->tb6_lock);
1277 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1280 read_unlock_bh(&table->tb6_lock);
/* Flow wrapper carrying the redirecting router's address alongside the
 * ordinary flow key, for __ip6_route_redirect() below. */
1288 struct ip6rd_flowi {
1290 struct in6_addr gateway;
/* Validate an ICMPv6 redirect against one table: find the current
 * route for the destination and accept the redirect only if it was
 * sent by that route's gateway on the expected interface. */
1293 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1297 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1298 struct rt6_info *rt;
1299 struct fib6_node *fn;
1302 * Get the "current" route for this destination and
1303 * check if the redirect has come from appropriate router.
1305 * RFC 2461 specifies that redirects should only be
1306 * accepted if they come from the nexthop to the target.
1307 * Due to the way the routes are chosen, this notion
1308 * is a bit fuzzy and one might need to check all possible
1312 read_lock_bh(&table->tb6_lock);
1313 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1315 for (rt = fn->leaf; rt; rt = rt->u.next) {
1317 * Current route is on-link; redirect is always invalid.
1319 * Seems, previous statement is not true. It could
1320 * be node, which looks for us as on-link (f.e. proxy ndisc)
1321 * But then router serving it might decide, that we should
1322 * know truth 8)8) --ANK (980726).
1324 if (rt6_check_expired(rt))
1326 if (!(rt->rt6i_flags & RTF_GATEWAY))
1328 if (fl->oif != rt->rt6i_dev->ifindex)
1330 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
/* No candidate matched: fall back to the sentinel and backtrack. */
1336 rt = &ip6_null_entry;
1337 BACKTRACK(&fl->fl6_src);
1339 dst_hold(&rt->u.dst);
1341 read_unlock_bh(&table->tb6_lock);
/* Build an ip6rd_flowi for a received redirect (destination, source,
 * redirecting gateway, receiving device) and resolve it through the
 * policy rules with __ip6_route_redirect. */
1346 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1347 struct in6_addr *src,
1348 struct in6_addr *gateway,
1349 struct net_device *dev)
1351 struct ip6rd_flowi rdfl = {
1353 .oif = dev->ifindex,
1361 .gateway = *gateway,
1363 int flags = rt6_need_strict(dest) ? RT6_F_STRICT : 0;
1365 return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
/* Act on a validated ICMPv6 redirect: update the neighbour cache from
 * the advertised link-layer address, then install a /128 RTF_DYNAMIC
 * cache route toward @dest via the new neighbour, replacing the path
 * that triggered the redirect.  Listeners are told via netevent. */
1368 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1369 struct in6_addr *saddr,
1370 struct neighbour *neigh, u8 *lladdr, int on_link)
1372 struct rt6_info *rt, *nrt = NULL;
1373 struct netevent_redirect netevent;
1375 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1377 if (rt == &ip6_null_entry) {
1378 if (net_ratelimit())
1379 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1380 "for redirect target\n");
1385 * We have finally decided to accept it.
/* Redirects off-link mark the new neighbour as a router. */
1388 neigh_update(neigh, lladdr, NUD_STALE,
1389 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1390 NEIGH_UPDATE_F_OVERRIDE|
1391 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1392 NEIGH_UPDATE_F_ISROUTER))
1396 * Redirect received -> path was valid.
1397 * Look, redirects are sent only in response to data packets,
1398 * so that this nexthop apparently is reachable. --ANK
1400 dst_confirm(&rt->u.dst);
1402 /* Duplicate redirect: silently ignore. */
1403 if (neigh == rt->u.dst.neighbour)
1406 nrt = ip6_rt_copy(rt);
1410 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1412 nrt->rt6i_flags &= ~RTF_GATEWAY;
1414 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1415 nrt->rt6i_dst.plen = 128;
1416 nrt->u.dst.flags |= DST_HOST;
1418 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1419 nrt->rt6i_nexthop = neigh_clone(neigh);
1420 /* Reset pmtu, it may be better */
1421 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1422 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1424 if (ip6_ins_rt(nrt))
1427 netevent.old = &rt->u.dst;
1428 netevent.new = &nrt->u.dst;
1429 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* The superseded cache entry is cleaned up here (body not visible). */
1431 if (rt->rt6i_flags&RTF_CACHE) {
1437 dst_release(&rt->u.dst);
1442 * Handle ICMP "packet too big" messages
1443 * i.e. Path MTU discovery
/* Record a smaller path MTU learned from an ICMPv6 Packet Too Big for
 * daddr/saddr on @dev.  Existing cache entries are updated in place;
 * otherwise a /128 clone (COW or plain) is created carrying the new
 * MTU, with a 10-minute expiry so PMTU increases can be rediscovered. */
1446 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1447 struct net_device *dev, u32 pmtu)
1449 struct rt6_info *rt, *nrt;
1452 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
/* Ignore reports that don't actually shrink the path MTU. */
1456 if (pmtu >= dst_mtu(&rt->u.dst))
1459 if (pmtu < IPV6_MIN_MTU) {
1461 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1462 * MTU (1280) and a fragment header should always be included
1463 * after a node receiving Too Big message reporting PMTU is
1464 * less than the IPv6 Minimum Link MTU.
1466 pmtu = IPV6_MIN_MTU;
1470 /* New mtu received -> path was valid.
1471 They are sent only in response to data packets,
1472 so that this nexthop apparently is reachable. --ANK
1474 dst_confirm(&rt->u.dst);
1476 /* Host route. If it is static, it would be better
1477 not to override it, but add new one, so that
1478 when cache entry will expire old pmtu
1479 would return automatically.
1481 if (rt->rt6i_flags & RTF_CACHE) {
1482 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1484 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1485 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1486 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1491 Two cases are possible:
1492 1. It is connected route. Action: COW
1493 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1495 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1496 nrt = rt6_alloc_cow(rt, daddr, saddr);
1498 nrt = rt6_alloc_clone(rt, daddr);
1501 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1503 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1505 /* According to RFC 1981, detecting PMTU increase shouldn't be
1506 * happened within 5 mins, the recommended timer is 10 mins.
1507 * Here this route expiration time is set to ip6_rt_mtu_expires
1508 * which is 10 mins. After 10 mins the decreased pmtu is expired
1509 * and detecting PMTU increase will be automatically happened.
1511 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1512 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1517 dst_release(&rt->u.dst);
1521 * Misc support functions
/* Shallow copy of @ort into a freshly allocated rt6_info: duplicates
 * hooks, metrics, device/idev references (with the proper holds),
 * keys and table pointer.  The copy never inherits RTF_EXPIRES and
 * starts with metric 0; the caller sets CACHE/DYNAMIC specifics. */
1524 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1526 struct rt6_info *rt = ip6_dst_alloc();
1529 rt->u.dst.input = ort->u.dst.input;
1530 rt->u.dst.output = ort->u.dst.output;
1532 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1533 rt->u.dst.dev = ort->u.dst.dev;
1535 dev_hold(rt->u.dst.dev);
1536 rt->rt6i_idev = ort->rt6i_idev;
1538 in6_dev_hold(rt->rt6i_idev);
1539 rt->u.dst.lastuse = jiffies;
1540 rt->rt6i_expires = 0;
1542 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1543 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1544 rt->rt6i_metric = 0;
1546 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1547 #ifdef CONFIG_IPV6_SUBTREES
1548 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1550 rt->rt6i_table = ort->rt6i_table;
1555 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RFC 4191 route-information route for prefix/gateway
 * on @ifindex in the RT6_TABLE_INFO table.  Returns a referenced
 * rt6_info or NULL. */
1556 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1557 struct in6_addr *gwaddr, int ifindex)
1559 struct fib6_node *fn;
1560 struct rt6_info *rt = NULL;
1561 struct fib6_table *table;
1563 table = fib6_get_table(RT6_TABLE_INFO);
1567 write_lock_bh(&table->tb6_lock);
1568 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
/* Match on device, both ROUTEINFO+GATEWAY flags, and gateway address. */
1572 for (rt = fn->leaf; rt; rt = rt->u.next) {
1573 if (rt->rt6i_dev->ifindex != ifindex)
1575 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1577 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1579 dst_hold(&rt->u.dst);
1583 write_unlock_bh(&table->tb6_lock);
/* Install an RFC 4191 route-information route via ip6_route_add() and
 * return the freshly looked-up (referenced) entry.  A zero-length
 * prefix is treated as a default route. */
1587 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1588 struct in6_addr *gwaddr, int ifindex,
1591 struct fib6_config cfg = {
1592 .fc_table = RT6_TABLE_INFO,
1594 .fc_ifindex = ifindex,
1595 .fc_dst_len = prefixlen,
1596 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1597 RTF_UP | RTF_PREF(pref),
1600 ipv6_addr_copy(&cfg.fc_dst, prefix);
1601 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1603 /* We should treat it as a default route if prefix length is 0. */
1605 cfg.fc_flags |= RTF_DEFAULT;
1607 ip6_route_add(&cfg);
1609 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
/*
 * rt6_get_dflt_router - find the autoconf default route via gateway @addr
 * on device @dev. Returns the route with a dst reference held, or NULL.
 * NOTE(review): excerpt elides the table NULL check, loop break and the
 * final return.
 */
1613 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1615 struct rt6_info *rt;
1616 struct fib6_table *table;
/* default routers live in the DFLT table; keep rt6_purge_dflt_routers
 * consistent with this lookup */
1618 table = fib6_get_table(RT6_TABLE_DFLT)
1622 write_lock_bh(&table->tb6_lock);
1623 for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
/* match device, ADDRCONF+DEFAULT flags, and the gateway address */
1624 if (dev == rt->rt6i_dev &&
1625 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1626 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1630 dst_hold(&rt->u.dst);
1631 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_dflt_router - install a default route learned from a Router
 * Advertisement (gateway @gwaddr on @dev) and return it with a held ref.
 * NOTE(review): excerpt elides the 'pref' parameter line and some cfg
 * initializers (expiry/metric).
 */
1635 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1636 struct net_device *dev,
1639 struct fib6_config cfg = {
1640 .fc_table = RT6_TABLE_DFLT,
1642 .fc_ifindex = dev->ifindex,
/* RA default routes expire (lifetime-driven) and carry a preference */
1643 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1644 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1647 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1649 ip6_route_add(&cfg);
/* re-look-up so the caller gets the installed route (or NULL on failure) */
1651 return rt6_get_dflt_router(gwaddr, dev);
/*
 * rt6_purge_dflt_routers - delete every autoconf default route.
 * The lock is dropped before each deletion (ip6_route_del takes it
 * itself), so the scan restarts after every removal.
 * NOTE(review): excerpt elides the ip6_route_del call and the 'goto
 * restart' that re-enters the loop after the lock is dropped.
 */
1654 void rt6_purge_dflt_routers(void)
1656 struct rt6_info *rt;
1657 struct fib6_table *table;
1659 /* NOTE: Keep consistent with rt6_get_dflt_router */
1660 table = fib6_get_table(RT6_TABLE_DFLT);
1665 read_lock_bh(&table->tb6_lock);
1666 for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1667 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
/* hold the route across the unlock so it cannot vanish under us */
1668 dst_hold(&rt->u.dst);
1669 read_unlock_bh(&table->tb6_lock);
1674 read_unlock_bh(&table->tb6_lock);
/*
 * rtmsg_to_fib6_config - translate the legacy ioctl route request
 * (struct in6_rtmsg) into the internal fib6_config representation.
 * ioctl-added routes always go into the MAIN table.
 */
1677 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1678 struct fib6_config *cfg)
1680 memset(cfg, 0, sizeof(*cfg));
1682 cfg->fc_table = RT6_TABLE_MAIN;
1683 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1684 cfg->fc_metric = rtmsg->rtmsg_metric;
/* rtmsg_info carries the expiry for RTF_EXPIRES routes */
1685 cfg->fc_expires = rtmsg->rtmsg_info;
1686 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1687 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1688 cfg->fc_flags = rtmsg->rtmsg_flags;
1690 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1691 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1692 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/*
 * ipv6_route_ioctl - legacy SIOCADDRT/SIOCDELRT route management.
 * Requires CAP_NET_ADMIN; copies an in6_rtmsg from userspace, converts
 * it and dispatches to ip6_route_add/del.
 * NOTE(review): excerpt elides the switch statement skeleton, the
 * copy_from_user error branch, rtnl locking and the returns (including
 * the -EINVAL default for unknown commands) -- verify against full file.
 */
1695 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1697 struct fib6_config cfg;
1698 struct in6_rtmsg rtmsg;
1702 case SIOCADDRT: /* Add a route */
1703 case SIOCDELRT: /* Delete a route */
/* route changes are privileged */
1704 if (!capable(CAP_NET_ADMIN))
1706 err = copy_from_user(&rtmsg, arg,
1707 sizeof(struct in6_rtmsg));
1711 rtmsg_to_fib6_config(&rtmsg, &cfg);
1716 err = ip6_route_add(&cfg);
1719 err = ip6_route_del(&cfg);
1733 * Drop the packet on the floor
/*
 * ip6_pkt_discard - dst input/output handler for blackhole/unreachable
 * routes: bump the appropriate SNMP counter, send an ICMPv6 "no route"
 * destination-unreachable back, and free the skb.
 * NOTE(review): excerpt elides the kfree_skb and return statements.
 */
1736 static int ip6_pkt_discard(struct sk_buff *skb)
1738 int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
/* bogus destinations count as address errors, not routing failures */
1739 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1740 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1742 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1743 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
/*
 * ip6_pkt_discard_out - output-path variant of ip6_pkt_discard: point
 * skb->dev at the route's device first so the ICMP error is attributed
 * to the right interface, then discard.
 */
1748 static int ip6_pkt_discard_out(struct sk_buff *skb)
1750 skb->dev = skb->dst->dev;
1751 return ip6_pkt_discard(skb);
1755 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc - build the host route (/128) for a local unicast
 * or anycast address on @idev.  The route is bound to the loopback
 * device for delivery, marked RTF_LOCAL or RTF_ANYCAST, and placed in
 * the LOCAL table.  Returns the route with refcnt 1, or ERR_PTR(-ENOMEM).
 * NOTE(review): excerpt elides the allocation NULL-check body, the
 * 'anycast' parameter line and the final return -- verify against the
 * full file.
 */
1758 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1759 const struct in6_addr *addr,
1762 struct rt6_info *rt = ip6_dst_alloc();
1765 return ERR_PTR(-ENOMEM);
/* local delivery always goes through loopback */
1767 dev_hold(&loopback_dev);
1770 rt->u.dst.flags = DST_HOST;
1771 rt->u.dst.input = ip6_input;
1772 rt->u.dst.output = ip6_output;
1773 rt->rt6i_dev = &loopback_dev;
1774 rt->rt6i_idev = idev;
1775 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1776 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
/* -1: use the per-device default hop limit */
1777 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1778 rt->u.dst.obsolete = -1;
1780 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1782 rt->rt6i_flags |= RTF_ANYCAST;
1784 rt->rt6i_flags |= RTF_LOCAL;
/* resolve (and hold) the neighbour entry up front */
1785 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1786 if (rt->rt6i_nexthop == NULL) {
1787 dst_free((struct dst_entry *) rt);
1788 return ERR_PTR(-ENOMEM);
1791 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1792 rt->rt6i_dst.plen = 128;
1793 rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1795 atomic_set(&rt->u.dst.__refcnt, 1);
/*
 * fib6_ifdown - fib6_clean_all callback: select routes for deletion when
 * a device goes down.  @arg is the net_device (or NULL to purge routes
 * on every device); the null entry is never deleted.
 * NOTE(review): excerpt elides the return values (nonzero = delete).
 */
1800 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1802 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1803 rt != &ip6_null_entry) {
1804 RT6_TRACE("deleted by ifdown %p\n", rt);
/*
 * rt6_ifdown - purge all routes referencing @dev (called when the
 * device is unregistered/brought down).
 */
1810 void rt6_ifdown(struct net_device *dev)
1812 fib6_clean_all(fib6_ifdown, 0, dev);
/*
 * Argument bundle for the rt6_mtu_change_route walker.
 * NOTE(review): excerpt elides the second member (the new 'mtu' value
 * referenced by the callback) and the closing brace.
 */
1815 struct rt6_mtu_change_arg
1817 struct net_device *dev;
1821 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1823 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1824 struct inet6_dev *idev;
1826 /* In IPv6 pmtu discovery is not optional,
1827 so that RTAX_MTU lock cannot disable it.
1828 We still use this lock to block changes
1829 caused by addrconf/ndisc.
1832 idev = __in6_dev_get(arg->dev);
1836 /* For administrative MTU increase, there is no way to discover
1837 IPv6 PMTU increase, so PMTU increase should be updated here.
1838 Since RFC 1981 doesn't include administrative MTU increase
1839 update PMTU increase is a MUST. (i.e. jumbo frame)
1842 If new MTU is less than route PMTU, this new MTU will be the
1843 lowest MTU in the path, update the route PMTU to reflect PMTU
1844 decreases; if new MTU is greater than route PMTU, and the
1845 old MTU is the lowest MTU in the path, update the route PMTU
1846 to reflect the increase. In this case if the other nodes' MTU
1847 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1850 if (rt->rt6i_dev == arg->dev &&
1851 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1852 (dst_mtu(&rt->u.dst) > arg->mtu ||
1853 (dst_mtu(&rt->u.dst) < arg->mtu &&
1854 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1855 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1856 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
/*
 * rt6_mtu_change - walk the whole FIB, updating route MTU/ADVMSS metrics
 * after @dev's MTU changed to @mtu.
 * NOTE(review): excerpt elides the arg initializers (.dev/.mtu).
 */
1860 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1862 struct rt6_mtu_change_arg arg = {
1867 fib6_clean_all(rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* requests:
 * gateway must be a full in6_addr; OIF/IIF/priority are u32;
 * metrics arrive as a nested attribute block. */
1870 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1871 [RTA_GATEWAY] = { .minlen = sizeof(struct in6_addr) },
1872 [RTA_OIF] = { .type = NLA_U32 },
1873 [RTA_IIF] = { .type = NLA_U32 },
1874 [RTA_PRIORITY] = { .type = NLA_U32 },
1875 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * rtm_to_fib6_config - parse an RTM_NEWROUTE/RTM_DELROUTE netlink message
 * into the internal fib6_config.  Validates attributes against
 * rtm_ipv6_policy and checks that DST/SRC payloads cover the claimed
 * prefix lengths.
 * NOTE(review): excerpt elides the parse-error goto, the -EINVAL
 * branches after the plen checks and the final returns.
 */
1878 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1879 struct fib6_config *cfg)
1882 struct nlattr *tb[RTA_MAX+1];
1885 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1890 rtm = nlmsg_data(nlh);
1891 memset(cfg, 0, sizeof(*cfg));
1893 cfg->fc_table = rtm->rtm_table;
1894 cfg->fc_dst_len = rtm->rtm_dst_len;
1895 cfg->fc_src_len = rtm->rtm_src_len;
1896 cfg->fc_flags = RTF_UP;
1897 cfg->fc_protocol = rtm->rtm_protocol;
1899 if (rtm->rtm_type == RTN_UNREACHABLE)
1900 cfg->fc_flags |= RTF_REJECT;
/* remember the requester for netlink notifications */
1902 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1903 cfg->fc_nlinfo.nlh = nlh;
1905 if (tb[RTA_GATEWAY]) {
1906 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1907 cfg->fc_flags |= RTF_GATEWAY;
/* only rtm_dst_len bits of the address need be present */
1911 int plen = (rtm->rtm_dst_len + 7) >> 3;
1913 if (nla_len(tb[RTA_DST]) < plen)
1916 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1920 int plen = (rtm->rtm_src_len + 7) >> 3;
1922 if (nla_len(tb[RTA_SRC]) < plen)
1925 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1929 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1931 if (tb[RTA_PRIORITY])
1932 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
/* metrics stay as a raw nested blob; consumed later by the add path */
1934 if (tb[RTA_METRICS]) {
1935 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1936 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
/* explicit RTA_TABLE overrides the header's rtm_table */
1940 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/*
 * inet6_rtm_delroute - RTM_DELROUTE handler: parse and delete.
 * NOTE(review): excerpt elides the parse-error early return.
 */
1947 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1949 struct fib6_config cfg;
1952 err = rtm_to_fib6_config(skb, nlh, &cfg);
1956 return ip6_route_del(&cfg);
/*
 * inet6_rtm_newroute - RTM_NEWROUTE handler: parse and add.
 * NOTE(review): excerpt elides the parse-error early return.
 */
1959 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1961 struct fib6_config cfg;
1964 err = rtm_to_fib6_config(skb, nlh, &cfg);
1968 return ip6_route_add(&cfg);
/*
 * rt6_fill_node - serialize @rt into an RTM message in @skb.
 * @dst/@src non-NULL means this answers a specific RTM_GETROUTE query
 * (full /128 addresses are reported); @prefix restricts output to
 * RTF_PREFIX_RT routes (prefix dumps).  On success returns the result of
 * nlmsg_end; on attribute overflow the message is cancelled.
 * NOTE(review): excerpt elides several declarations (rtm, table, the
 * expires else-branch), some goto targets and the nla_put_failure label
 * line itself -- the NLA_PUT* macros jump there on overflow.
 */
1971 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1972 struct in6_addr *dst, struct in6_addr *src,
1973 int iif, int type, u32 pid, u32 seq,
1974 int prefix, unsigned int flags)
1977 struct nlmsghdr *nlh;
1978 struct rta_cacheinfo ci;
1981 if (prefix) { /* user wants prefix routes only */
1982 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1983 /* success since this is not a prefix route */
1988 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
1992 rtm = nlmsg_data(nlh);
1993 rtm->rtm_family = AF_INET6;
1994 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1995 rtm->rtm_src_len = rt->rt6i_src.plen;
1998 table = rt->rt6i_table->tb6_id;
2000 table = RT6_TABLE_UNSPEC;
/* table id goes both in the header and as an attribute */
2001 rtm->rtm_table = table;
2002 NLA_PUT_U32(skb, RTA_TABLE, table);
/* classify the route type for userspace */
2003 if (rt->rt6i_flags&RTF_REJECT)
2004 rtm->rtm_type = RTN_UNREACHABLE;
2005 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2006 rtm->rtm_type = RTN_LOCAL;
2008 rtm->rtm_type = RTN_UNICAST;
2010 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2011 rtm->rtm_protocol = rt->rt6i_protocol;
/* derive the originating protocol from the route flags */
2012 if (rt->rt6i_flags&RTF_DYNAMIC)
2013 rtm->rtm_protocol = RTPROT_REDIRECT;
2014 else if (rt->rt6i_flags & RTF_ADDRCONF)
2015 rtm->rtm_protocol = RTPROT_KERNEL;
2016 else if (rt->rt6i_flags&RTF_DEFAULT)
2017 rtm->rtm_protocol = RTPROT_RA;
2019 if (rt->rt6i_flags&RTF_CACHE)
2020 rtm->rtm_flags |= RTM_F_CLONED;
/* for a query reply, report the exact queried destination */
2023 NLA_PUT(skb, RTA_DST, 16, dst);
2024 rtm->rtm_dst_len = 128;
2025 } else if (rtm->rtm_dst_len)
2026 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2027 #ifdef CONFIG_IPV6_SUBTREES
2029 NLA_PUT(skb, RTA_SRC, 16, src);
2030 rtm->rtm_src_len = 128;
2031 } else if (rtm->rtm_src_len)
2032 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2035 NLA_PUT_U32(skb, RTA_IIF, iif);
2037 struct in6_addr saddr_buf;
/* report the preferred source address the stack would pick */
2038 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2039 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2042 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2043 goto nla_put_failure;
2045 if (rt->u.dst.neighbour)
2046 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2049 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2051 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* cache bookkeeping, exported in clock_t units */
2052 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2053 if (rt->rt6i_expires)
2054 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2057 ci.rta_used = rt->u.dst.__use;
2058 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2059 ci.rta_error = rt->u.dst.error;
2063 NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2065 return nlmsg_end(skb, nlh);
/* reached via the NLA_PUT* overflow gotos */
2068 return nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - per-route callback for RTM_GETROUTE dumps: honour the
 * RTM_F_PREFIX filter if the request carried a full rtmsg header, then
 * emit the route via rt6_fill_node with NLM_F_MULTI.
 * NOTE(review): excerpt elides the 'prefix' declaration/initialization.
 */
2071 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2073 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
/* older tools may send a short header; only then is rtm_flags valid */
2076 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2077 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2078 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2082 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2083 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2084 prefix, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - RTM_GETROUTE handler: build a flow from the
 * request attributes, resolve it through ip6_route_output, serialize the
 * answer with rt6_fill_node and unicast it back to the requester.
 * NOTE(review): excerpt elides declarations (rtm, fl, iif, err), several
 * error branches (bad attributes, missing iif device, alloc failure, fill
 * failure with kfree_skb) and goto labels.
 */
2087 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2089 struct nlattr *tb[RTA_MAX+1];
2090 struct rt6_info *rt;
2091 struct sk_buff *skb;
2096 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2101 memset(&fl, 0, sizeof(fl));
/* SRC/DST must be full 128-bit addresses for a lookup */
2104 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2107 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2111 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2114 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2118 iif = nla_get_u32(tb[RTA_IIF]);
2121 fl.oif = nla_get_u32(tb[RTA_OIF]);
/* input-interface queries require the device to exist */
2124 struct net_device *dev;
2125 dev = __dev_get_by_index(iif);
2132 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2138 /* Reserve room for dummy headers, this skb can pass
2139 through good chunk of routing engine.
2141 skb->mac.raw = skb->data;
2142 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* attach the resolved route so fill_node can read it */
2144 rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2145 skb->dst = &rt->u.dst;
2147 err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2148 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2149 nlh->nlmsg_seq, 0, 0);
2155 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
/*
 * inet6_rt_notify - broadcast a route change (@event) to the
 * RTNLGRP_IPV6_ROUTE netlink multicast group; on any failure the group
 * is flagged with the error so listeners can resync.
 * NOTE(review): excerpt elides the info->nlh NULL check guarding the
 * pid/seq extraction, the alloc-failure branch and the errout label.
 */
2160 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2162 struct sk_buff *skb;
2163 u32 pid = 0, seq = 0;
2164 struct nlmsghdr *nlh = NULL;
/* 256 bytes of attribute headroom on top of the rtmsg header */
2165 int payload = sizeof(struct rtmsg) + 256;
2172 seq = nlh->nlmsg_seq;
2175 skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2179 err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2185 err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2188 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2195 #ifdef CONFIG_PROC_FS
2197 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/*
 * rt6_info_route - fib6_clean_all callback that formats one route as a
 * fixed-width (RT6_INFO_LEN) line of /proc/net/ipv6_route: dst, src,
 * next-hop (hex addresses + prefix lengths), metric, refcnt, use count,
 * flags and device name.
 * NOTE(review): excerpt elides the arg->len accounting after each
 * sprintf loop, the skip/offset bookkeeping bodies and the returns.
 */
2208 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2210 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
/* honour the read offset by skipping whole lines */
2213 if (arg->skip < arg->offset / RT6_INFO_LEN) {
/* stop producing once the caller's buffer is full */
2218 if (arg->len >= arg->length)
2221 for (i=0; i<16; i++) {
2222 sprintf(arg->buffer + arg->len, "%02x",
2223 rt->rt6i_dst.addr.s6_addr[i]);
2226 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2229 #ifdef CONFIG_IPV6_SUBTREES
2230 for (i=0; i<16; i++) {
2231 sprintf(arg->buffer + arg->len, "%02x",
2232 rt->rt6i_src.addr.s6_addr[i]);
2235 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
/* without subtrees the src column is all-zeroes */
2238 sprintf(arg->buffer + arg->len,
2239 "00000000000000000000000000000000 00 ");
2243 if (rt->rt6i_nexthop) {
2244 for (i=0; i<16; i++) {
2245 sprintf(arg->buffer + arg->len, "%02x",
2246 rt->rt6i_nexthop->primary_key[i]);
2250 sprintf(arg->buffer + arg->len,
2251 "00000000000000000000000000000000");
2254 arg->len += sprintf(arg->buffer + arg->len,
2255 " %08x %08x %08x %08x %8s\n",
2256 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2257 rt->u.dst.__use, rt->rt6i_flags,
2258 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/*
 * rt6_proc_info - legacy procfs read handler for /proc/net/ipv6_route:
 * walks the FIB through rt6_info_route, then adjusts *start/len for the
 * requested offset/length window (lines are fixed RT6_INFO_LEN bytes).
 * NOTE(review): excerpt elides the arg initializers, the *start
 * assignment base and the final return of arg.len.
 */
2262 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2264 struct rt6_proc_arg arg = {
2270 fib6_clean_all(rt6_info_route, 0, &arg);
/* trim the partial line at the front of the window */
2274 *start += offset % RT6_INFO_LEN;
2276 arg.len -= offset % RT6_INFO_LEN;
2278 if (arg.len > length)
/*
 * rt6_stats_seq_show - emit /proc/net/rt6_stats: fib node/route counters,
 * the live dst-entry count and the number of discarded routes, as seven
 * hex fields on one line.
 */
2286 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2288 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2289 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2290 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2291 rt6_stats.fib_rt_cache,
2292 atomic_read(&ip6_dst_ops.entries),
2293 rt6_stats.fib_discarded_routes);
/* open handler: single-shot seq_file wrapping rt6_stats_seq_show */
2298 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2300 return single_open(file, rt6_stats_seq_show, NULL);
/* file_operations for /proc/net/rt6_stats.
 * NOTE(review): the .read = seq_read initializer is elided in this
 * excerpt. */
2303 static struct file_operations rt6_stats_seq_fops = {
2304 .owner = THIS_MODULE,
2305 .open = rt6_stats_seq_open,
2307 .llseek = seq_lseek,
2308 .release = single_release,
2312 #ifdef CONFIG_SYSCTL
/* sysctl-backed delay value written via net.ipv6.route.flush */
2314 static int flush_delay;
/*
 * ipv6_sysctl_rtcache_flush - handler for the "flush" sysctl: on write,
 * run garbage collection; flush_delay <= 0 means flush everything now
 * (~0UL), a positive value expires entries older than that many jiffies.
 * NOTE(review): excerpt elides the write-vs-read branch and the returns
 * (reads presumably return -EINVAL -- verify against the full file).
 */
2317 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2318 void __user *buffer, size_t *lenp, loff_t *ppos)
2321 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2322 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
/*
 * net.ipv6.route.* sysctl table.  jiffies-valued entries use the
 * proc_dointvec_jiffies handler so userspace reads/writes seconds;
 * gc_min_interval_ms aliases gc_min_interval with millisecond units.
 * NOTE(review): the .mode initializers and the closing terminator entry
 * are elided in this excerpt.
 */
2328 ctl_table ipv6_route_table[] = {
2330 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2331 .procname = "flush",
2332 .data = &flush_delay,
2333 .maxlen = sizeof(int),
2335 .proc_handler = &ipv6_sysctl_rtcache_flush
2338 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2339 .procname = "gc_thresh",
2340 .data = &ip6_dst_ops.gc_thresh,
2341 .maxlen = sizeof(int),
2343 .proc_handler = &proc_dointvec,
2346 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2347 .procname = "max_size",
2348 .data = &ip6_rt_max_size,
2349 .maxlen = sizeof(int),
2351 .proc_handler = &proc_dointvec,
2354 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2355 .procname = "gc_min_interval",
2356 .data = &ip6_rt_gc_min_interval,
2357 .maxlen = sizeof(int),
2359 .proc_handler = &proc_dointvec_jiffies,
2360 .strategy = &sysctl_jiffies,
2363 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2364 .procname = "gc_timeout",
2365 .data = &ip6_rt_gc_timeout,
2366 .maxlen = sizeof(int),
2368 .proc_handler = &proc_dointvec_jiffies,
2369 .strategy = &sysctl_jiffies,
2372 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2373 .procname = "gc_interval",
2374 .data = &ip6_rt_gc_interval,
2375 .maxlen = sizeof(int),
2377 .proc_handler = &proc_dointvec_jiffies,
2378 .strategy = &sysctl_jiffies,
2381 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2382 .procname = "gc_elasticity",
2383 .data = &ip6_rt_gc_elasticity,
2384 .maxlen = sizeof(int),
2386 .proc_handler = &proc_dointvec_jiffies,
2387 .strategy = &sysctl_jiffies,
2390 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2391 .procname = "mtu_expires",
2392 .data = &ip6_rt_mtu_expires,
2393 .maxlen = sizeof(int),
2395 .proc_handler = &proc_dointvec_jiffies,
2396 .strategy = &sysctl_jiffies,
2399 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2400 .procname = "min_adv_mss",
2401 .data = &ip6_rt_min_advmss,
2402 .maxlen = sizeof(int),
2404 .proc_handler = &proc_dointvec_jiffies,
2405 .strategy = &sysctl_jiffies,
2408 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2409 .procname = "gc_min_interval_ms",
2410 .data = &ip6_rt_gc_min_interval,
2411 .maxlen = sizeof(int),
2413 .proc_handler = &proc_dointvec_ms_jiffies,
2414 .strategy = &sysctl_ms_jiffies,
/*
 * ip6_route_init - boot-time setup: create the rt6_info slab cache
 * (panic on failure -- routing cannot work without it), register the
 * /proc/net/ipv6_route and /proc/net/rt6_stats entries, and (elided in
 * this excerpt) initialize fib6 and, with multiple-tables support, the
 * fib6 policy rules.
 */
2421 void __init ip6_route_init(void)
2423 struct proc_dir_entry *p;
2425 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2426 sizeof(struct rt6_info),
2427 0, SLAB_HWCACHE_ALIGN,
2429 if (!ip6_dst_ops.kmem_cachep)
2430 panic("cannot create ip6_dst_cache");
2433 #ifdef CONFIG_PROC_FS
2434 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2436 p->owner = THIS_MODULE;
2438 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2443 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * ip6_route_cleanup - teardown mirror of ip6_route_init: remove policy
 * rules (if configured), unregister the proc entries and destroy the
 * slab cache.  Elided interior lines presumably include xfrm6/fib6
 * cleanup -- verify against the full file.
 */
2448 void ip6_route_cleanup(void)
2450 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2451 fib6_rules_cleanup();
2453 #ifdef CONFIG_PROC_FS
2454 proc_net_remove("ipv6_route");
2455 proc_net_remove("rt6_stats");
2462 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);