2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
64 /* Set to 3 to get tracing. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
75 #define CLONE_OFFLINK_ROUTE 0
77 #define RT6_SELECT_F_IFACE 0x1
78 #define RT6_SELECT_F_REACHABLE 0x2
/* Tunable parameters for the IPv6 routing cache and its garbage collector.
 * Time values are in jiffies.  NOTE(review): presumably exposed via sysctl
 * elsewhere in this file — confirm against the full source. */
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
/* Minimum advertised MSS: minimum IPv6 link MTU minus what appears to be
 * TCP (20) and IPv6 (40) header sizes — TODO confirm the 20/40 intent. */
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(void);
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct sk_buff *skb);
98 static void ip6_link_failure(struct sk_buff *skb);
99 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 struct in6_addr *gwaddr, int ifindex,
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 struct in6_addr *gwaddr, int ifindex);
/* dst_ops table wiring the generic destination-cache machinery to the
 * IPv6-specific callbacks defined in this file.
 * NOTE(review): some initializer lines (e.g. .family, .gc) are missing
 * from this excerpt. */
109 static struct dst_ops ip6_dst_ops = {
111 .protocol = __constant_htons(ETH_P_IPV6),
114 .check = ip6_dst_check,
115 .destroy = ip6_dst_destroy,
116 .ifdown = ip6_dst_ifdown,
117 .negative_advice = ip6_negative_advice,
118 .link_failure = ip6_link_failure,
119 .update_pmtu = ip6_rt_update_pmtu,
120 .entry_size = sizeof(struct rt6_info),
/* Sentinel "no route" entry returned by lookups that fail: a reject route
 * bound to the loopback device, with worst-possible metric so it never wins
 * selection.  Refcounts start at 1 so it is never freed. */
123 struct rt6_info ip6_null_entry = {
126 .__refcnt = ATOMIC_INIT(1),
128 .dev = &loopback_dev,
130 .error = -ENETUNREACH,
131 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
132 .input = ip6_pkt_discard,
133 .output = ip6_pkt_discard_out,
135 .path = (struct dst_entry*)&ip6_null_entry,
138 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
139 .rt6i_metric = ~(u32) 0,
140 .rt6i_ref = ATOMIC_INIT(1),
/* Additional sentinels used by policy routing (multiple tables):
 * "prohibit" and "blackhole" behave like ip6_null_entry but carry
 * different error semantics.  NOTE(review): the .error initializers are
 * among the lines missing from this excerpt. */
143 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145 struct rt6_info ip6_prohibit_entry = {
148 .__refcnt = ATOMIC_INIT(1),
150 .dev = &loopback_dev,
153 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
154 .input = ip6_pkt_discard,
155 .output = ip6_pkt_discard_out,
157 .path = (struct dst_entry*)&ip6_prohibit_entry,
160 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
161 .rt6i_metric = ~(u32) 0,
162 .rt6i_ref = ATOMIC_INIT(1),
165 struct rt6_info ip6_blk_hole_entry = {
168 .__refcnt = ATOMIC_INIT(1),
170 .dev = &loopback_dev,
173 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
174 .input = ip6_pkt_discard,
175 .output = ip6_pkt_discard_out,
177 .path = (struct dst_entry*)&ip6_blk_hole_entry,
180 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
181 .rt6i_metric = ~(u32) 0,
182 .rt6i_ref = ATOMIC_INIT(1),
187 /* allocate dst with ip6_dst_ops */
/* Thin wrapper over dst_alloc() using the IPv6 dst_ops table; returns a
 * fresh rt6_info, or NULL on allocation failure (dst_alloc contract). */
188 static __inline__ struct rt6_info *ip6_dst_alloc(void)
190 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
/* dst_ops.destroy callback: drop this route's reference on its inet6_dev.
 * NOTE(review): the line releasing idev (in6_dev_put) is missing from this
 * excerpt — only the NULL-ing of the pointer is visible. */
193 static void ip6_dst_destroy(struct dst_entry *dst)
195 struct rt6_info *rt = (struct rt6_info *)dst;
196 struct inet6_dev *idev = rt->rt6i_idev;
199 rt->rt6i_idev = NULL;
/* dst_ops.ifdown callback: when a device goes away, re-point cached routes
 * at the loopback inet6_dev so the rt6_info never holds a stale idev.
 * NOTE(review): the release of the old idev reference is not visible here. */
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207 struct rt6_info *rt = (struct rt6_info *)dst;
208 struct inet6_dev *idev = rt->rt6i_idev;
210 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
211 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
212 if (loopback_idev != NULL) {
213 rt->rt6i_idev = loopback_idev;
/* True if the route carries RTF_EXPIRES and its expiry time has passed. */
219 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
221 return (rt->rt6i_flags & RTF_EXPIRES &&
222 time_after(jiffies, rt->rt6i_expires));
/* True if the destination is multicast or link-local, i.e. scope-sensitive
 * addresses for which the outgoing interface must be matched strictly. */
225 static inline int rt6_need_strict(struct in6_addr *daddr)
227 return (ipv6_addr_type(daddr) &
228 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
232 * Route lookup. Any table->tb6_lock is implied.
/* Walk the sibling list starting at rt and pick the entry whose device
 * matches oif (loopback entries match via their idev's ifindex).  Falls
 * back to &ip6_null_entry when a strict match is required and none found.
 * NOTE(review): the "local" fallback path and the strict/non-strict branch
 * are partially elided in this excerpt. */
235 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
239 struct rt6_info *local = NULL;
240 struct rt6_info *sprt;
243 for (sprt = rt; sprt; sprt = sprt->u.next) {
244 struct net_device *dev = sprt->rt6i_dev;
245 if (dev->ifindex == oif)
247 if (dev->flags & IFF_LOOPBACK) {
248 if (sprt->rt6i_idev == NULL ||
249 sprt->rt6i_idev->dev->ifindex != oif) {
252 if (local && (!oif ||
253 local->rt6i_idev->dev->ifindex == oif))
264 return &ip6_null_entry;
269 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Router Reachability Probing (rate-limited by rtr_probe_interval):
 * if the nexthop neighbour is not in a VALID state and has not been
 * updated recently, send a unicast-solicited NS to re-probe it.
 * No-op when there is no neighbour or it is already reachable. */
270 static void rt6_probe(struct rt6_info *rt)
272 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
274 * Okay, this does not seem to be appropriate
275 * for now, however, we need to check if it
276 * is really so; aka Router Reachability Probing.
278 * Router Reachability Probe MUST be rate-limited
279 * to no more than one per minute.
281 if (!neigh || (neigh->nud_state & NUD_VALID))
283 read_lock_bh(&neigh->lock);
284 if (!(neigh->nud_state & NUD_VALID) &&
285 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
286 struct in6_addr mcaddr;
287 struct in6_addr *target;
/* Mark probed now, then drop the lock before sending the NS. */
289 neigh->updated = jiffies;
290 read_unlock_bh(&neigh->lock);
292 target = (struct in6_addr *)&neigh->primary_key;
293 addrconf_addr_solict_mult(target, &mcaddr);
294 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
296 read_unlock_bh(&neigh->lock);
/* Stub when CONFIG_IPV6_ROUTER_PREF is not set (the #else/#endif lines
 * are not visible in this excerpt). */
299 static inline void rt6_probe(struct rt6_info *rt)
306 * Default Router Selection (RFC 2461 6.3.6)
/* Interface-match score component: nonzero when the route's device matches
 * oif (or no oif given); loopback routes match through their idev.
 * NOTE(review): the return-value lines are elided here — exact scores
 * cannot be confirmed from this excerpt. */
308 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
310 struct net_device *dev = rt->rt6i_dev;
311 if (!oif || dev->ifindex == oif)
313 if ((dev->flags & IFF_LOOPBACK) &&
314 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Neighbour-reachability score component: routes without a gateway nexthop
 * are trivially acceptable; otherwise check the neighbour's NUD state
 * under its read lock. */
319 static int inline rt6_check_neigh(struct rt6_info *rt)
321 struct neighbour *neigh = rt->rt6i_nexthop;
323 if (rt->rt6i_flags & RTF_NONEXTHOP ||
324 !(rt->rt6i_flags & RTF_GATEWAY))
327 read_lock_bh(&neigh->lock);
328 if (neigh->nud_state & NUD_VALID)
330 read_unlock_bh(&neigh->lock);
/* Combine device match, (optional) RFC 4191 router preference, and
 * neighbour reachability into a single comparable score; the strict
 * flags turn a failed component into outright rejection.
 * NOTE(review): the failure-return lines are missing from this excerpt. */
335 static int rt6_score_route(struct rt6_info *rt, int oif,
340 m = rt6_check_dev(rt, oif);
341 if (!m && (strict & RT6_SELECT_F_IFACE))
343 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Fold the decoded router preference into higher-order score bits. */
344 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
346 n = rt6_check_neigh(rt);
349 else if (!n && strict & RT6_SELECT_F_REACHABLE)
/* Select the best route among same-metric siblings at *head, scoring each
 * with rt6_score_route().  When reachability was required and nothing
 * matched, rotate the sibling list (round-robin) so the next lookup tries
 * a different router — see the file header notes on default router
 * selection.  Returns &ip6_null_entry if no candidate qualifies.
 * NOTE(review): the scoring/bookkeeping between lines 375 and 389 and the
 * list-splice tail are elided in this excerpt. */
354 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
357 struct rt6_info *match = NULL, *last = NULL;
358 struct rt6_info *rt, *rt0 = *head;
362 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
363 __FUNCTION__, head, head ? *head : NULL, oif);
365 for (rt = rt0, metric = rt0->rt6i_metric;
366 rt && rt->rt6i_metric == metric && (!last || rt != rt0);
370 if (rt6_check_expired(rt))
375 m = rt6_score_route(rt, oif, strict);
389 (strict & RT6_SELECT_F_REACHABLE) &&
390 last && last != rt0) {
391 /* no entries matched; do round-robin */
392 static DEFINE_SPINLOCK(lock);
395 rt0->u.next = last->u.next;
400 RT6_TRACE("%s() => %p, score=%d\n",
401 __FUNCTION__, match, mpri);
403 return (match ? match : &ip6_null_entry);
406 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option received in a Router Advertisement
 * (RFC 4191): validate lengths, decode preference and lifetime, then add,
 * refresh, or (lifetime 0) delete the corresponding RTF_ROUTEINFO route.
 * NOTE(review): several error-return lines are elided in this excerpt. */
407 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
408 struct in6_addr *gwaddr)
410 struct route_info *rinfo = (struct route_info *) opt;
411 struct in6_addr prefix_buf, *prefix;
416 if (len < sizeof(struct route_info)) {
420 /* Sanity check for prefix_len and length */
421 if (rinfo->length > 3) {
423 } else if (rinfo->prefix_len > 128) {
425 } else if (rinfo->prefix_len > 64) {
426 if (rinfo->length < 2) {
429 } else if (rinfo->prefix_len > 0) {
430 if (rinfo->length < 1) {
435 pref = rinfo->route_pref;
436 if (pref == ICMPV6_ROUTER_PREF_INVALID)
437 pref = ICMPV6_ROUTER_PREF_MEDIUM;
/* NOTE(review): htonl on a received field looks like a byte-swap of
 * network-order data; presumably equivalent to ntohl here — verify. */
439 lifetime = htonl(rinfo->lifetime);
440 if (lifetime == 0xffffffff) {
442 } else if (lifetime > 0x7fffffff/HZ) {
443 /* Avoid arithmetic overflow */
444 lifetime = 0x7fffffff/HZ - 1;
/* length == 3 means the full 128-bit prefix is present in the option;
 * otherwise copy only prefix_len bits into a zeroed buffer. */
447 if (rinfo->length == 3)
448 prefix = (struct in6_addr *)rinfo->prefix;
450 /* this function is safe */
451 ipv6_addr_prefix(&prefix_buf,
452 (struct in6_addr *)rinfo->prefix,
454 prefix = &prefix_buf;
457 rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
/* Zero lifetime: withdraw an existing route. */
459 if (rt && !lifetime) {
465 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
/* Refresh preference bits on the (new or existing) route. */
468 rt->rt6i_flags = RTF_ROUTEINFO |
469 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* Infinite lifetime -> never expires; otherwise arm RTF_EXPIRES. */
472 if (lifetime == 0xffffffff) {
473 rt->rt6i_flags &= ~RTF_EXPIRES;
475 rt->rt6i_expires = jiffies + HZ * lifetime;
476 rt->rt6i_flags |= RTF_EXPIRES;
478 dst_release(&rt->u.dst);
/* Shared lookup-retry macro: when the match is &ip6_null_entry, walk back
 * up the fib6 tree (following subtrees for source routing) and retry until
 * a node with route info is found or the tree root is reached.  Expects
 * `rt` and `fn` in the caller's scope and a `restart` label to jump to.
 * NOTE(review): the goto/continuation lines of the macro are elided here. */
484 #define BACKTRACK(saddr) \
486 if (rt == &ip6_null_entry) { \
487 struct fib6_node *pn; \
489 if (fn->fn_flags & RTN_TL_ROOT) \
492 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
493 fn = fib6_lookup(pn->subtree, NULL, saddr); \
496 if (fn->fn_flags & RTN_RTINFO) \
/* Simple (non-cloning) route lookup in one table under tb6_lock:
 * fib6_lookup by dst/src, strict device match, BACKTRACK on failure,
 * then take a reference and stamp lastuse. */
502 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
503 struct flowi *fl, int flags)
505 struct fib6_node *fn;
508 read_lock_bh(&table->tb6_lock);
509 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
512 rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
513 BACKTRACK(&fl->fl6_src);
514 dst_hold(&rt->u.dst);
516 read_unlock_bh(&table->tb6_lock);
518 rt->u.dst.lastuse = jiffies;
/* Public lookup entry point: builds a flowi (initializer lines elided in
 * this excerpt) and dispatches through the policy-routing rule engine. */
525 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
537 struct dst_entry *dst;
538 int flags = strict ? RT6_F_STRICT : 0;
540 dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
542 return (struct rt6_info *) dst;
549 /* ip6_ins_rt is called with FREE table->tb6_lock.
550 It takes new route entry, the addition fails by any reason the
551 route is freed. In any case, if caller does not hold it, it may
/* Insert rt into its table under the table write lock; returns the
 * fib6_add() error code. */
555 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
558 struct fib6_table *table;
560 table = rt->rt6i_table;
561 write_lock_bh(&table->tb6_lock);
562 err = fib6_add(&table->tb6_root, rt, info);
563 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper: insert with no netlink notification info. */
568 int ip6_ins_rt(struct rt6_info *rt)
570 return __ip6_ins_rt(rt, NULL);
/* Copy-on-write clone for a connected (non-gateway) route: copy ort, pin
 * it to the specific daddr (/128 host route, RTF_CACHE|DST_HOST), mark
 * anycast when daddr equals the prefix address, and resolve a neighbour
 * entry for the nexthop.  NOTE(review): NULL-check of the copy result is
 * among the lines elided in this excerpt. */
573 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
574 struct in6_addr *saddr)
582 rt = ip6_rt_copy(ort);
585 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
586 if (rt->rt6i_dst.plen != 128 &&
587 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
588 rt->rt6i_flags |= RTF_ANYCAST;
589 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
592 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
593 rt->rt6i_dst.plen = 128;
594 rt->rt6i_flags |= RTF_CACHE;
595 rt->u.dst.flags |= DST_HOST;
597 #ifdef CONFIG_IPV6_SUBTREES
/* Source routing: also pin the source key to /128. */
598 if (rt->rt6i_src.plen && saddr) {
599 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
600 rt->rt6i_src.plen = 128;
604 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Plain clone for gateway/NONEXTHOP routes: same /128 cache pinning but
 * the nexthop neighbour is shared (cloned) from the original. */
611 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
613 struct rt6_info *rt = ip6_rt_copy(ort);
615 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
616 rt->rt6i_dst.plen = 128;
617 rt->rt6i_flags |= RTF_CACHE;
618 if (rt->rt6i_flags & RTF_REJECT)
619 rt->u.dst.error = ort->u.dst.error;
620 rt->u.dst.flags |= DST_HOST;
621 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
/* Input-path route resolution for one table.  Strategy: look up under the
 * read lock; if the result is a cache entry (or null) use it directly,
 * otherwise drop the lock, clone/COW a host route, insert it, and handle
 * the race where someone inserted it first by relooking up.
 * NOTE(review): the retry/relookup control flow between the visible lines
 * (labels, reachability fallback) is elided in this excerpt. */
626 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
627 struct flowi *fl, int flags)
629 struct fib6_node *fn;
630 struct rt6_info *rt, *nrt;
634 int reachable = RT6_SELECT_F_REACHABLE;
636 if (flags & RT6_F_STRICT)
637 strict = RT6_SELECT_F_IFACE;
640 read_lock_bh(&table->tb6_lock);
643 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
646 rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
647 BACKTRACK(&fl->fl6_src);
648 if (rt == &ip6_null_entry ||
649 rt->rt6i_flags & RTF_CACHE)
652 dst_hold(&rt->u.dst);
653 read_unlock_bh(&table->tb6_lock);
/* Connected route -> COW; otherwise optionally clone off-link routes. */
655 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
656 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
658 #if CLONE_OFFLINK_ROUTE
659 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
665 dst_release(&rt->u.dst);
666 rt = nrt ? : &ip6_null_entry;
668 dst_hold(&rt->u.dst);
670 err = ip6_ins_rt(nrt);
679 * Race condition! In the gap, when table->tb6_lock was
680 * released someone could insert this route. Relookup.
682 dst_release(&rt->u.dst);
690 dst_hold(&rt->u.dst);
691 read_unlock_bh(&table->tb6_lock);
693 rt->u.dst.lastuse = jiffies;
/* Attach a route to an incoming skb: build the flow from the IPv6 header
 * (some initializer lines elided) and resolve via the rule engine. */
699 void ip6_route_input(struct sk_buff *skb)
701 struct ipv6hdr *iph = skb->nh.ipv6h;
703 .iif = skb->dev->ifindex,
708 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
711 .proto = iph->nexthdr,
715 if (rt6_need_strict(&iph->daddr))
716 flags |= RT6_F_STRICT;
718 skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
/* Output-path route resolution: mirrors ip6_pol_route_input() but selects
 * on fl->oif instead of fl->iif.  Same lock-drop / clone / insert /
 * relookup-on-race structure; see the input variant above for details.
 * NOTE(review): retry labels and fallback paths are elided here as well. */
721 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
722 struct flowi *fl, int flags)
724 struct fib6_node *fn;
725 struct rt6_info *rt, *nrt;
729 int reachable = RT6_SELECT_F_REACHABLE;
731 if (flags & RT6_F_STRICT)
732 strict = RT6_SELECT_F_IFACE;
735 read_lock_bh(&table->tb6_lock);
738 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
741 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
742 BACKTRACK(&fl->fl6_src);
743 if (rt == &ip6_null_entry ||
744 rt->rt6i_flags & RTF_CACHE)
747 dst_hold(&rt->u.dst);
748 read_unlock_bh(&table->tb6_lock);
750 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
751 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
753 #if CLONE_OFFLINK_ROUTE
754 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
760 dst_release(&rt->u.dst);
761 rt = nrt ? : &ip6_null_entry;
763 dst_hold(&rt->u.dst);
765 err = ip6_ins_rt(nrt);
774 * Race condition! In the gap, when table->tb6_lock was
775 * released someone could insert this route. Relookup.
777 dst_release(&rt->u.dst);
785 dst_hold(&rt->u.dst);
786 read_unlock_bh(&table->tb6_lock);
788 rt->u.dst.lastuse = jiffies;
/* Public output lookup: strict interface matching for scoped (multicast /
 * link-local) destinations, then dispatch through the rule engine. */
793 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
797 if (rt6_need_strict(&fl->fl6_dst))
798 flags |= RT6_F_STRICT;
800 return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
805 * Destination cache support functions
/* dst_ops.check: a cached dst is still valid while its fib6 node's serial
 * number matches the cookie recorded at lookup time. */
808 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
812 rt = (struct rt6_info *) dst;
814 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
/* dst_ops.negative_advice: cache entries get dropped on negative feedback
 * (deletion path elided in this excerpt). */
820 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
822 struct rt6_info *rt = (struct rt6_info *) dst;
825 if (rt->rt6i_flags & RTF_CACHE)
/* dst_ops.link_failure: report unreachability via ICMPv6, expire the cache
 * entry immediately, or invalidate the fib node for default routes. */
833 static void ip6_link_failure(struct sk_buff *skb)
837 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
839 rt = (struct rt6_info *) skb->dst;
841 if (rt->rt6i_flags&RTF_CACHE) {
842 dst_set_expires(&rt->u.dst, 0);
843 rt->rt6i_flags |= RTF_EXPIRES;
844 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
845 rt->rt6i_node->fn_sernum = -1;
/* dst_ops.update_pmtu: shrink the cached MTU on a host route; below the
 * IPv6 minimum, keep IPV6_MIN_MTU and set ALLFRAG instead. */
849 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
851 struct rt6_info *rt6 = (struct rt6_info*)dst;
853 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
854 rt6->rt6i_flags |= RTF_MODIFIED;
855 if (mtu < IPV6_MIN_MTU) {
857 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
859 dst->metrics[RTAX_MTU-1] = mtu;
860 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
864 static int ipv6_get_mtu(struct net_device *dev);
/* Derive the advertised MSS from an MTU: subtract IPv6+TCP header sizes,
 * clamp to the configured minimum, and cap at the non-jumbo maximum. */
866 static inline unsigned int ipv6_advmss(unsigned int mtu)
868 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
870 if (mtu < ip6_rt_min_advmss)
871 mtu = ip6_rt_min_advmss;
874 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
875 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
876 * IPV6_MAXPLEN is also valid and means: "any MSS,
877 * rely only on pmtu discovery"
879 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* Per-CPU-independent list of dst entries allocated for neighbour
 * discovery, reclaimed by ndisc_dst_gc() below; ndisc_lock guards it. */
884 static struct dst_entry *ndisc_dst_gc_list;
885 static DEFINE_SPINLOCK(ndisc_lock);
/* Allocate a throwaway host dst for sending an NDISC packet to addr via
 * dev: resolves (or reuses) a neighbour, fills in hoplimit/MTU/advmss
 * metrics, chains it on ndisc_dst_gc_list, and kicks the fib6 GC.
 * NOTE(review): the error-unwind paths (in6_dev_put etc.) are elided. */
887 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
888 struct neighbour *neigh,
889 struct in6_addr *addr,
890 int (*output)(struct sk_buff *))
893 struct inet6_dev *idev = in6_dev_get(dev);
895 if (unlikely(idev == NULL))
898 rt = ip6_dst_alloc();
899 if (unlikely(rt == NULL)) {
908 neigh = ndisc_get_neigh(dev, addr);
911 rt->rt6i_idev = idev;
912 rt->rt6i_nexthop = neigh;
913 atomic_set(&rt->u.dst.__refcnt, 1);
914 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
915 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
916 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
917 rt->u.dst.output = output;
919 #if 0 /* there's no chance to use these for ndisc */
920 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
923 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
924 rt->rt6i_dst.plen = 128;
927 spin_lock_bh(&ndisc_lock);
928 rt->u.dst.next = ndisc_dst_gc_list;
929 ndisc_dst_gc_list = &rt->u.dst;
930 spin_unlock_bh(&ndisc_lock);
932 fib6_force_start_gc();
935 return (struct dst_entry *)rt;
/* Reap unreferenced entries from ndisc_dst_gc_list; *more presumably
 * reports whether referenced entries remain — confirm against full source
 * (the loop body that frees/advances is elided in this excerpt). */
938 int ndisc_dst_gc(int *more)
940 struct dst_entry *dst, *next, **pprev;
946 spin_lock_bh(&ndisc_lock);
947 pprev = &ndisc_dst_gc_list;
949 while ((dst = *pprev) != NULL) {
950 if (!atomic_read(&dst->__refcnt)) {
960 spin_unlock_bh(&ndisc_lock);
/* dst_ops garbage-collector hook: rate-limited by ip6_rt_gc_min_interval,
 * with an adaptive `expire` horizon that shrinks (via gc_elasticity) while
 * the table stays over threshold.  Returns nonzero when still over
 * ip6_rt_max_size.  NOTE(review): the fib6_run_gc() call and last_gc
 * update are among the elided lines. */
965 static int ip6_dst_gc(void)
967 static unsigned expire = 30*HZ;
968 static unsigned long last_gc;
969 unsigned long now = jiffies;
971 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
972 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
978 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
979 expire = ip6_rt_gc_timeout>>1;
982 expire -= expire>>ip6_rt_gc_elasticity;
983 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
986 /* Clean host part of a prefix. Not necessary in radix tree,
987 but results in cleaner routing tables.
989 Remove it only when all the things will work!
/* Per-device IPv6 MTU: the inet6_dev's configured mtu6 if the device has
 * one, else the IPv6 minimum link MTU.  (The in6_dev_put release line is
 * not visible in this excerpt.) */
992 static int ipv6_get_mtu(struct net_device *dev)
994 int mtu = IPV6_MIN_MTU;
995 struct inet6_dev *idev;
997 idev = in6_dev_get(dev);
999 mtu = idev->cnf.mtu6;
/* Per-device hop limit, falling back to the global ipv6_devconf default. */
1005 int ipv6_get_hoplimit(struct net_device *dev)
1007 int hoplimit = ipv6_devconf.hop_limit;
1008 struct inet6_dev *idev;
1010 idev = in6_dev_get(dev);
1012 hoplimit = idev->cnf.hop_limit;
/* Add a route described by cfg to its fib6 table.
 * Steps: validate prefix lengths; resolve device/idev from fc_ifindex;
 * pick a table; allocate the rt6_info; set input/output handlers by
 * address type; promote loopback-bound "true" routes to reject routes;
 * validate/resolve the gateway (on-link constraint); resolve the nexthop
 * neighbour; copy metrics from netlink attributes; then insert.
 * Returns 0 or a negative errno.  NOTE(review): numerous error-exit and
 * `goto out` lines are elided in this excerpt, so the exact unwind order
 * cannot be confirmed here. */
1022 int ip6_route_add(struct fib6_config *cfg)
1025 struct rt6_info *rt = NULL;
1026 struct net_device *dev = NULL;
1027 struct inet6_dev *idev = NULL;
1028 struct fib6_table *table;
1031 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1033 #ifndef CONFIG_IPV6_SUBTREES
/* Source-routing prefixes are only valid with subtree support. */
1034 if (cfg->fc_src_len)
1037 if (cfg->fc_ifindex) {
1039 dev = dev_get_by_index(cfg->fc_ifindex);
1042 idev = in6_dev_get(dev);
1047 if (cfg->fc_metric == 0)
1048 cfg->fc_metric = IP6_RT_PRIO_USER;
1050 table = fib6_new_table(cfg->fc_table);
1051 if (table == NULL) {
1056 rt = ip6_dst_alloc();
1063 rt->u.dst.obsolete = -1;
1064 rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1066 if (cfg->fc_protocol == RTPROT_UNSPEC)
1067 cfg->fc_protocol = RTPROT_BOOT;
1068 rt->rt6i_protocol = cfg->fc_protocol;
1070 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Multicast destinations take the multicast input path. */
1072 if (addr_type & IPV6_ADDR_MULTICAST)
1073 rt->u.dst.input = ip6_mc_input;
1075 rt->u.dst.input = ip6_forward;
1077 rt->u.dst.output = ip6_output;
1079 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1080 rt->rt6i_dst.plen = cfg->fc_dst_len;
1081 if (rt->rt6i_dst.plen == 128)
1082 rt->u.dst.flags = DST_HOST;
1084 #ifdef CONFIG_IPV6_SUBTREES
1085 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1086 rt->rt6i_src.plen = cfg->fc_src_len;
1089 rt->rt6i_metric = cfg->fc_metric;
1091 /* We cannot add true routes via loopback here,
1092 they would result in kernel looping; promote them to reject routes
1094 if ((cfg->fc_flags & RTF_REJECT) ||
1095 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1096 /* hold loopback dev/idev if we haven't done so. */
1097 if (dev != &loopback_dev) {
1102 dev = &loopback_dev;
1104 idev = in6_dev_get(dev);
1110 rt->u.dst.output = ip6_pkt_discard_out;
1111 rt->u.dst.input = ip6_pkt_discard;
1112 rt->u.dst.error = -ENETUNREACH;
1113 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1117 if (cfg->fc_flags & RTF_GATEWAY) {
1118 struct in6_addr *gw_addr;
1121 gw_addr = &cfg->fc_gateway;
1122 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1123 gwa_type = ipv6_addr_type(gw_addr);
1125 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1126 struct rt6_info *grt;
1128 /* IPv6 strictly inhibits using not link-local
1129 addresses as nexthop address.
1130 Otherwise, router will not able to send redirects.
1131 It is very good, but in some (rare!) circumstances
1132 (SIT, PtP, NBMA NOARP links) it is handy to allow
1133 some exceptions. --ANK
1136 if (!(gwa_type&IPV6_ADDR_UNICAST))
/* The non-link-local gateway must itself be reachable on-link
 * through the requested interface. */
1139 grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1141 err = -EHOSTUNREACH;
1145 if (dev != grt->rt6i_dev) {
1146 dst_release(&grt->u.dst);
1150 dev = grt->rt6i_dev;
1151 idev = grt->rt6i_idev;
1153 in6_dev_hold(grt->rt6i_idev);
/* A gateway must not itself be reached via another gateway. */
1155 if (!(grt->rt6i_flags&RTF_GATEWAY))
1157 dst_release(&grt->u.dst);
1163 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1171 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1172 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1173 if (IS_ERR(rt->rt6i_nexthop)) {
1174 err = PTR_ERR(rt->rt6i_nexthop);
1175 rt->rt6i_nexthop = NULL;
1180 rt->rt6i_flags = cfg->fc_flags;
/* Copy per-route metrics from the netlink RTA_METRICS attribute blob. */
1187 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1188 int type = nla->nla_type;
1191 if (type > RTAX_MAX) {
1196 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
/* Fill defaults for metrics the caller left unset. */
1201 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1202 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1203 if (!rt->u.dst.metrics[RTAX_MTU-1])
1204 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1205 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1206 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1207 rt->u.dst.dev = dev;
1208 rt->rt6i_idev = idev;
1209 rt->rt6i_table = table;
1210 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/* Error unwind (labels elided): free the half-built route. */
1218 dst_free((struct dst_entry *) rt);
/* Delete rt from its table under the write lock; the null sentinel is
 * never deletable.  Consumes the caller's reference (dst_release). */
1222 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1225 struct fib6_table *table;
1227 if (rt == &ip6_null_entry)
1230 table = rt->rt6i_table;
1231 write_lock_bh(&table->tb6_lock);
1233 err = fib6_del(rt, info);
1234 dst_release(&rt->u.dst);
1236 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper: delete with no netlink notification info. */
1241 int ip6_del_rt(struct rt6_info *rt)
1243 return __ip6_del_rt(rt, NULL);
/* Netlink/ioctl-driven delete: locate the fib node for cfg's dst/src
 * prefix, then scan its leaf chain for an entry matching the optional
 * ifindex / gateway / metric filters.  NOTE(review): the not-found error
 * returns are elided in this excerpt. */
1246 static int ip6_route_del(struct fib6_config *cfg)
1248 struct fib6_table *table;
1249 struct fib6_node *fn;
1250 struct rt6_info *rt;
1253 table = fib6_get_table(cfg->fc_table);
1257 read_lock_bh(&table->tb6_lock);
1259 fn = fib6_locate(&table->tb6_root,
1260 &cfg->fc_dst, cfg->fc_dst_len,
1261 &cfg->fc_src, cfg->fc_src_len);
1264 for (rt = fn->leaf; rt; rt = rt->u.next) {
1265 if (cfg->fc_ifindex &&
1266 (rt->rt6i_dev == NULL ||
1267 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1269 if (cfg->fc_flags & RTF_GATEWAY &&
1270 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1272 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Match: hold the route, drop the lock, then delete it. */
1274 dst_hold(&rt->u.dst);
1275 read_unlock_bh(&table->tb6_lock);
1277 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1280 read_unlock_bh(&table->tb6_lock);
/* flowi extended with the redirecting router's address, so the per-table
 * redirect lookup can check the sender against each route's gateway. */
1288 struct ip6rd_flowi {
1290 struct in6_addr gateway;
/* Find the route the redirect applies to: the current route toward
 * fl6_dst whose gateway and outgoing interface match the redirect's
 * source router (RFC 2461: redirects are only valid from the nexthop). */
1293 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1297 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1298 struct rt6_info *rt;
1299 struct fib6_node *fn;
1302 * Get the "current" route for this destination and
1303 * check if the redirect has come from approriate router.
1305 * RFC 2461 specifies that redirects should only be
1306 * accepted if they come from the nexthop to the target.
1307 * Due to the way the routes are chosen, this notion
1308 * is a bit fuzzy and one might need to check all possible
1312 read_lock_bh(&table->tb6_lock);
1313 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1315 for (rt = fn->leaf; rt; rt = rt->u.next) {
1317 * Current route is on-link; redirect is always invalid.
1319 * Seems, previous statement is not true. It could
1320 * be node, which looks for us as on-link (f.e. proxy ndisc)
1321 * But then router serving it might decide, that we should
1322 * know truth 8)8) --ANK (980726).
1324 if (rt6_check_expired(rt))
1326 if (!(rt->rt6i_flags & RTF_GATEWAY))
1328 if (fl->oif != rt->rt6i_dev->ifindex)
1330 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
/* No match at this node: for scoped destinations walk back toward the
 * root looking for another node carrying route info. */
1336 if (rt6_need_strict(&fl->fl6_dst)) {
1337 while ((fn = fn->parent) != NULL) {
1338 if (fn->fn_flags & RTN_ROOT)
1340 if (fn->fn_flags & RTN_RTINFO)
1344 rt = &ip6_null_entry;
1346 dst_hold(&rt->u.dst);
1348 read_unlock_bh(&table->tb6_lock);
/* Build the extended flow for a received redirect and dispatch the
 * per-table search through the policy-routing rule engine.
 * (Some flowi initializer lines are elided in this excerpt.) */
1353 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1354 struct in6_addr *src,
1355 struct in6_addr *gateway,
1356 struct net_device *dev)
1358 struct ip6rd_flowi rdfl = {
1360 .oif = dev->ifindex,
1368 .gateway = *gateway,
1370 int flags = rt6_need_strict(dest) ? RT6_F_STRICT : 0;
1372 return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
/* Handle an accepted ICMPv6 Redirect: validate the sending router is our
 * current nexthop, update the neighbour cache with its link-layer address,
 * then install a cloned /128 cache route pointing at the new nexthop and
 * fire a NETEVENT_REDIRECT.  NOTE(review): NULL checks and the old-cache
 * deletion between the visible lines are elided in this excerpt. */
1375 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1376 struct in6_addr *saddr,
1377 struct neighbour *neigh, u8 *lladdr, int on_link)
1379 struct rt6_info *rt, *nrt = NULL;
1380 struct netevent_redirect netevent;
1382 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1384 if (rt == &ip6_null_entry) {
1385 if (net_ratelimit())
1386 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1387 "for redirect target\n");
1392 * We have finally decided to accept it.
1395 neigh_update(neigh, lladdr, NUD_STALE,
1396 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1397 NEIGH_UPDATE_F_OVERRIDE|
1398 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1399 NEIGH_UPDATE_F_ISROUTER))
1403 * Redirect received -> path was valid.
1404 * Look, redirects are sent only in response to data packets,
1405 * so that this nexthop apparently is reachable. --ANK
1407 dst_confirm(&rt->u.dst);
1409 /* Duplicate redirect: silently ignore. */
1410 if (neigh == rt->u.dst.neighbour)
1413 nrt = ip6_rt_copy(rt);
1417 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
/* on_link redirect: target is directly reachable, drop the gateway bit. */
1419 nrt->rt6i_flags &= ~RTF_GATEWAY;
1421 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1422 nrt->rt6i_dst.plen = 128;
1423 nrt->u.dst.flags |= DST_HOST;
1425 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1426 nrt->rt6i_nexthop = neigh_clone(neigh);
1427 /* Reset pmtu, it may be better */
1428 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1429 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1431 if (ip6_ins_rt(nrt))
1434 netevent.old = &rt->u.dst;
1435 netevent.new = &nrt->u.dst;
1436 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* The superseded cache entry is removed (deletion call elided here). */
1438 if (rt->rt6i_flags&RTF_CACHE) {
1444 dst_release(&rt->u.dst);
1449 * Handle ICMP "packet too big" messages
1450 * i.e. Path MTU discovery
/* Apply a learned PMTU to the route toward daddr: update an existing
 * RTF_CACHE host route in place, or COW/clone a new host route carrying
 * the reduced MTU with a 10-minute expiry (ip6_rt_mtu_expires) so PMTU
 * increases are re-detected.  Sub-minimum PMTUs are clamped to
 * IPV6_MIN_MTU with ALLFRAG set (RFC 2460).  NOTE(review): NULL checks
 * and the insert of nrt are among the elided lines. */
1453 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1454 struct net_device *dev, u32 pmtu)
1456 struct rt6_info *rt, *nrt;
1459 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
/* Ignore reports that do not actually shrink the path MTU. */
1463 if (pmtu >= dst_mtu(&rt->u.dst))
1466 if (pmtu < IPV6_MIN_MTU) {
1468 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1469 * MTU (1280) and a fragment header should always be included
1470 * after a node receiving Too Big message reporting PMTU is
1471 * less than the IPv6 Minimum Link MTU.
1473 pmtu = IPV6_MIN_MTU;
1477 /* New mtu received -> path was valid.
1478 They are sent only in response to data packets,
1479 so that this nexthop apparently is reachable. --ANK
1481 dst_confirm(&rt->u.dst);
1483 /* Host route. If it is static, it would be better
1484 not to override it, but add new one, so that
1485 when cache entry will expire old pmtu
1486 would return automatically.
1488 if (rt->rt6i_flags & RTF_CACHE) {
1489 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1491 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1492 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1493 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1498 Two cases are possible:
1499 1. It is connected route. Action: COW
1500 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1502 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1503 nrt = rt6_alloc_cow(rt, daddr, saddr);
1505 nrt = rt6_alloc_clone(rt, daddr);
1508 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1510 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1512 /* According to RFC 1981, detecting PMTU increase shouldn't be
1513 * happened within 5 mins, the recommended timer is 10 mins.
1514 * Here this route expiration time is set to ip6_rt_mtu_expires
1515 * which is 10 mins. After 10 mins the decreased pmtu is expired
1516 * and detecting PMTU increase will be automatically happened.
1518 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1519 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1524 dst_release(&rt->u.dst);
1528 * Misc support functions
/* Shallow-copy a route for cloning/COW: duplicates handlers, metrics,
 * device (held), idev (held), gateway, flags (minus RTF_EXPIRES), dst key,
 * and table pointer; metric is zeroed and lastuse refreshed.  Returns the
 * new rt6_info (NULL-check lines elided in this excerpt). */
1531 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1533 struct rt6_info *rt = ip6_dst_alloc();
1536 rt->u.dst.input = ort->u.dst.input;
1537 rt->u.dst.output = ort->u.dst.output;
1539 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1540 rt->u.dst.dev = ort->u.dst.dev;
1542 dev_hold(rt->u.dst.dev);
1543 rt->rt6i_idev = ort->rt6i_idev;
1545 in6_dev_hold(rt->rt6i_idev);
1546 rt->u.dst.lastuse = jiffies;
1547 rt->rt6i_expires = 0;
1549 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1550 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1551 rt->rt6i_metric = 0;
1553 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1554 #ifdef CONFIG_IPV6_SUBTREES
1555 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1557 rt->rt6i_table = ort->rt6i_table;
1562 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an existing RFC 4191 route-information route for prefix/gwaddr
 * on ifindex in RT6_TABLE_INFO; returns it held, or NULL. */
1563 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1564 struct in6_addr *gwaddr, int ifindex)
1566 struct fib6_node *fn;
1567 struct rt6_info *rt = NULL;
1568 struct fib6_table *table;
1570 table = fib6_get_table(RT6_TABLE_INFO);
1574 write_lock_bh(&table->tb6_lock);
1575 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1579 for (rt = fn->leaf; rt; rt = rt->u.next) {
1580 if (rt->rt6i_dev->ifindex != ifindex)
1582 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1584 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1586 dst_hold(&rt->u.dst);
1590 write_unlock_bh(&table->tb6_lock);
/* Install a route-information route via ip6_route_add(), then return the
 * freshly inserted (held) entry by re-looking it up.  A zero-length
 * prefix is treated as a default route. */
1594 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1595 struct in6_addr *gwaddr, int ifindex,
1598 struct fib6_config cfg = {
1599 .fc_table = RT6_TABLE_INFO,
1601 .fc_ifindex = ifindex,
1602 .fc_dst_len = prefixlen,
1603 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1604 RTF_UP | RTF_PREF(pref),
1607 ipv6_addr_copy(&cfg.fc_dst, prefix);
1608 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1610 /* We should treat it as a default route if prefix length is 0. */
1612 cfg.fc_flags |= RTF_DEFAULT;
1614 ip6_route_add(&cfg);
1616 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
/*
 * rt6_get_dflt_router - find the RA-learned default route via gateway
 * @addr on device @dev in RT6_TABLE_DFLT.  Only entries carrying both
 * RTF_ADDRCONF and RTF_DEFAULT match.  Returns the entry with a dst
 * reference held, or NULL.
 */
1620 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1622 struct rt6_info *rt;
1623 struct fib6_table *table;
1625 table = fib6_get_table(RT6_TABLE_DFLT);
1629 write_lock_bh(&table->tb6_lock);
1630 for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1631 if (dev == rt->rt6i_dev &&
1632 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1633 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1637 dst_hold(&rt->u.dst);
1638 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_dflt_router - install a default route to router @gwaddr on
 * @dev (from a Router Advertisement), marked RTF_EXPIRES so it ages out
 * with the router lifetime.  Returns the newly added entry via a
 * re-lookup (with a dst reference held), or NULL if the add failed.
 */
1642 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1643 struct net_device *dev,
1646 struct fib6_config cfg = {
1647 .fc_table = RT6_TABLE_DFLT,
1649 .fc_ifindex = dev->ifindex,
1650 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1651 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1654 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1656 ip6_route_add(&cfg);
1658 return rt6_get_dflt_router(gwaddr, dev);
/*
 * rt6_purge_dflt_routers - remove every RA-learned default route from
 * RT6_TABLE_DFLT.  For each matching entry the lock is dropped before
 * deletion (a reference is taken first) and the scan restarts --
 * NOTE(review): the delete/restart statements are outside this excerpt;
 * confirm against the full source.
 */
1661 void rt6_purge_dflt_routers(void)
1663 struct rt6_info *rt;
1664 struct fib6_table *table;
1666 /* NOTE: Keep consistent with rt6_get_dflt_router */
1667 table = fib6_get_table(RT6_TABLE_DFLT);
1672 read_lock_bh(&table->tb6_lock);
1673 for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1674 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
/* hold the entry so it survives dropping the table lock */
1675 dst_hold(&rt->u.dst);
1676 read_unlock_bh(&table->tb6_lock);
1681 read_unlock_bh(&table->tb6_lock);
/*
 * rtmsg_to_fib6_config - translate a legacy ioctl in6_rtmsg into the
 * internal fib6_config representation.  Always targets RT6_TABLE_MAIN;
 * rtmsg_info is reused as the expiry value.
 */
1684 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1685 struct fib6_config *cfg)
1687 memset(cfg, 0, sizeof(*cfg));
1689 cfg->fc_table = RT6_TABLE_MAIN;
1690 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1691 cfg->fc_metric = rtmsg->rtmsg_metric;
1692 cfg->fc_expires = rtmsg->rtmsg_info;
1693 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1694 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1695 cfg->fc_flags = rtmsg->rtmsg_flags;
1697 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1698 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1699 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/*
 * ipv6_route_ioctl - handle the legacy SIOCADDRT/SIOCDELRT route
 * ioctls.  Requires CAP_NET_ADMIN; copies the in6_rtmsg from
 * userspace, converts it and dispatches to ip6_route_add/del.
 */
1702 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1704 struct fib6_config cfg;
1705 struct in6_rtmsg rtmsg;
1709 case SIOCADDRT: /* Add a route */
1710 case SIOCDELRT: /* Delete a route */
1711 if (!capable(CAP_NET_ADMIN))
1713 err = copy_from_user(&rtmsg, arg,
1714 sizeof(struct in6_rtmsg));
1718 rtmsg_to_fib6_config(&rtmsg, &cfg);
1723 err = ip6_route_add(&cfg);
1726 err = ip6_route_del(&cfg);
1740 * Drop the packet on the floor
/*
 * ip6_pkt_discard - dst input/output handler for the null route.
 * Bumps either InAddrErrors (unspecified/reserved destination) or
 * OutNoRoutes, and answers with an ICMPv6 Destination Unreachable
 * (no route) before dropping the packet.
 */
1743 static int ip6_pkt_discard(struct sk_buff *skb)
1745 int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1746 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1747 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1749 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1750 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
/*
 * ip6_pkt_discard_out - output-path variant: points skb->dev at the
 * dst device first so the ICMP error is sent on the right interface,
 * then delegates to ip6_pkt_discard().
 */
1755 static int ip6_pkt_discard_out(struct sk_buff *skb)
1757 skb->dev = skb->dst->dev;
1758 return ip6_pkt_discard(skb);
1762 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc - allocate a host route (dst) for a local unicast
 * or anycast address.  The route is bound to the loopback device, gets
 * RTF_LOCAL or RTF_ANYCAST as appropriate, resolves its neighbour
 * entry, and is placed in RT6_TABLE_LOCAL.  Returns the entry with one
 * reference held, or ERR_PTR(-ENOMEM) on failure.
 */
1765 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1766 const struct in6_addr *addr,
1769 struct rt6_info *rt = ip6_dst_alloc();
1772 return ERR_PTR(-ENOMEM);
/* local addresses always live on loopback */
1774 dev_hold(&loopback_dev);
1777 rt->u.dst.flags = DST_HOST;
1778 rt->u.dst.input = ip6_input;
1779 rt->u.dst.output = ip6_output;
1780 rt->rt6i_dev = &loopback_dev;
1781 rt->rt6i_idev = idev;
1782 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1783 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1784 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1785 rt->u.dst.obsolete = -1;
1787 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1789 rt->rt6i_flags |= RTF_ANYCAST;
1791 rt->rt6i_flags |= RTF_LOCAL;
1792 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1793 if (rt->rt6i_nexthop == NULL) {
/* neighbour allocation failed: free the half-built dst */
1794 dst_free((struct dst_entry *) rt);
1795 return ERR_PTR(-ENOMEM);
1798 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1799 rt->rt6i_dst.plen = 128;
1800 rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
/* hand one reference to the caller */
1802 atomic_set(&rt->u.dst.__refcnt, 1);
/*
 * fib6_ifdown - fib6_clean_all() callback: select routes for deletion
 * when a device goes down.  Matches routes on the given device (or all
 * routes when @arg is NULL), but never the shared null entry.
 */
1807 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1809 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1810 rt != &ip6_null_entry) {
1811 RT6_TRACE("deleted by ifdown %p\n", rt);
/*
 * rt6_ifdown - purge all routes referencing @dev by walking every FIB
 * table with the fib6_ifdown callback.
 */
1817 void rt6_ifdown(struct net_device *dev)
1819 fib6_clean_all(fib6_ifdown, 0, dev);
/* Argument bundle passed through fib6_clean_all() to
 * rt6_mtu_change_route(): the device whose MTU changed (and, in the
 * full definition, the new MTU value). */
1822 struct rt6_mtu_change_arg
1824 struct net_device *dev;
/*
 * rt6_mtu_change_route - fib6_clean_all() callback applied to every
 * route when a device MTU changes: update the route's cached PMTU
 * (and advmss) for routes on the affected device, unless RTAX_MTU is
 * administratively locked.
 */
1828 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1830 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1831 struct inet6_dev *idev;
1833 /* In IPv6 pmtu discovery is not optional,
1834 so that RTAX_MTU lock cannot disable it.
1835 We still use this lock to block changes
1836 caused by addrconf/ndisc.
1839 idev = __in6_dev_get(arg->dev);
1843 /* For administrative MTU increase, there is no way to discover
1844 IPv6 PMTU increase, so PMTU increase should be updated here.
1845 Since RFC 1981 doesn't include administrative MTU increase
1846 update PMTU increase is a MUST. (i.e. jumbo frame)
1849 If new MTU is less than route PMTU, this new MTU will be the
1850 lowest MTU in the path, update the route PMTU to reflect PMTU
1851 decreases; if new MTU is greater than route PMTU, and the
1852 old MTU is the lowest MTU in the path, update the route PMTU
1853 to reflect the increase. In this case if the other nodes' MTU
1854 also have the lowest MTU, TOO BIG MESSAGE will be lead to
/* update when: MTU shrank below the cached PMTU, or it grew and the
 * cached PMTU equalled the device's old MTU (we were the bottleneck) */
1857 if (rt->rt6i_dev == arg->dev &&
1858 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1859 (dst_mtu(&rt->u.dst) > arg->mtu ||
1860 (dst_mtu(&rt->u.dst) < arg->mtu &&
1861 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1862 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1863 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
/*
 * rt6_mtu_change - propagate a device MTU change to all cached routes
 * by walking the FIB with rt6_mtu_change_route().
 */
1867 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1869 struct rt6_mtu_change_arg arg = {
1874 fib6_clean_all(rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE
 * messages: gateway must be a full in6_addr; oif/iif/priority are u32;
 * metrics is a nested attribute block. */
1877 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1878 [RTA_GATEWAY] = { .minlen = sizeof(struct in6_addr) },
1879 [RTA_OIF] = { .type = NLA_U32 },
1880 [RTA_IIF] = { .type = NLA_U32 },
1881 [RTA_PRIORITY] = { .type = NLA_U32 },
1882 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * rtm_to_fib6_config - parse an rtnetlink route message (header plus
 * attributes, validated against rtm_ipv6_policy) into a fib6_config.
 * RTN_UNREACHABLE maps to RTF_REJECT; RTA_TABLE, when present,
 * overrides the rtm_table field.  Prefix attributes (RTA_DST/RTA_SRC)
 * are copied only up to the prefix length in bytes.
 */
1885 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1886 struct fib6_config *cfg)
1889 struct nlattr *tb[RTA_MAX+1];
1892 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1897 rtm = nlmsg_data(nlh);
1898 memset(cfg, 0, sizeof(*cfg));
1900 cfg->fc_table = rtm->rtm_table;
1901 cfg->fc_dst_len = rtm->rtm_dst_len;
1902 cfg->fc_src_len = rtm->rtm_src_len;
1903 cfg->fc_flags = RTF_UP;
1904 cfg->fc_protocol = rtm->rtm_protocol;
1906 if (rtm->rtm_type == RTN_UNREACHABLE)
1907 cfg->fc_flags |= RTF_REJECT;
/* remember who asked, for notification echo */
1909 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1910 cfg->fc_nlinfo.nlh = nlh;
1912 if (tb[RTA_GATEWAY]) {
1913 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1914 cfg->fc_flags |= RTF_GATEWAY;
/* prefix length in whole bytes, rounded up */
1918 int plen = (rtm->rtm_dst_len + 7) >> 3;
1920 if (nla_len(tb[RTA_DST]) < plen)
1923 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1927 int plen = (rtm->rtm_src_len + 7) >> 3;
1929 if (nla_len(tb[RTA_SRC]) < plen)
1932 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1936 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1938 if (tb[RTA_PRIORITY])
1939 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1941 if (tb[RTA_METRICS]) {
1942 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1943 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1947 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/*
 * inet6_rtm_delroute - RTM_DELROUTE handler: parse the message and
 * delete the described route.
 */
1954 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1956 struct fib6_config cfg;
1959 err = rtm_to_fib6_config(skb, nlh, &cfg);
1963 return ip6_route_del(&cfg);
/*
 * inet6_rtm_newroute - RTM_NEWROUTE handler: parse the message and
 * install the described route.
 */
1966 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1968 struct fib6_config cfg;
1971 err = rtm_to_fib6_config(skb, nlh, &cfg);
1975 return ip6_route_add(&cfg);
/*
 * rt6_fill_node - serialize one rt6_info into a netlink RTM message in
 * @skb: rtmsg header, table id, destination/source prefixes (or the
 * exact addresses when @dst/@src are supplied for a cache/get reply),
 * iif or preferred-source, metrics, gateway, oif, priority and
 * cacheinfo.  When @prefix is set, non-prefix routes are skipped
 * (reported as success).  Returns the result of nlmsg_end(), or the
 * cancel path on attribute-space exhaustion.
 */
1978 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1979 struct in6_addr *dst, struct in6_addr *src,
1980 int iif, int type, u32 pid, u32 seq,
1981 int prefix, unsigned int flags)
1984 struct nlmsghdr *nlh;
1985 struct rta_cacheinfo ci;
1988 if (prefix) { /* user wants prefix routes only */
1989 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1990 /* success since this is not a prefix route */
1995 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
1999 rtm = nlmsg_data(nlh);
2000 rtm->rtm_family = AF_INET6;
2001 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2002 rtm->rtm_src_len = rt->rt6i_src.plen;
2005 table = rt->rt6i_table->tb6_id;
2007 table = RT6_TABLE_UNSPEC;
2008 rtm->rtm_table = table;
2009 NLA_PUT_U32(skb, RTA_TABLE, table);
/* map route flags onto the rtnetlink route type */
2010 if (rt->rt6i_flags&RTF_REJECT)
2011 rtm->rtm_type = RTN_UNREACHABLE;
2012 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2013 rtm->rtm_type = RTN_LOCAL;
2015 rtm->rtm_type = RTN_UNICAST;
2017 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2018 rtm->rtm_protocol = rt->rt6i_protocol;
2019 if (rt->rt6i_flags&RTF_DYNAMIC)
2020 rtm->rtm_protocol = RTPROT_REDIRECT;
2021 else if (rt->rt6i_flags & RTF_ADDRCONF)
2022 rtm->rtm_protocol = RTPROT_KERNEL;
2023 else if (rt->rt6i_flags&RTF_DEFAULT)
2024 rtm->rtm_protocol = RTPROT_RA;
2026 if (rt->rt6i_flags&RTF_CACHE)
2027 rtm->rtm_flags |= RTM_F_CLONED;
/* an explicit @dst (route-get reply) is reported as a /128 */
2030 NLA_PUT(skb, RTA_DST, 16, dst);
2031 rtm->rtm_dst_len = 128;
2032 } else if (rtm->rtm_dst_len)
2033 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2034 #ifdef CONFIG_IPV6_SUBTREES
2036 NLA_PUT(skb, RTA_SRC, 16, src);
2037 rtm->rtm_src_len = 128;
2038 } else if (rtm->rtm_src_len)
2039 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2042 NLA_PUT_U32(skb, RTA_IIF, iif);
2044 struct in6_addr saddr_buf;
2045 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2046 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2049 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2050 goto nla_put_failure;
2052 if (rt->u.dst.neighbour)
2053 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2056 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2058 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* timestamps exported in clock ticks relative to now */
2059 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2060 if (rt->rt6i_expires)
2061 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2064 ci.rta_used = rt->u.dst.__use;
2065 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2066 ci.rta_error = rt->u.dst.error;
2070 NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2072 return nlmsg_end(skb, nlh);
2075 return nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - per-route callback for RTM_GETROUTE dumps: honour
 * the RTM_F_PREFIX filter from the request (if a full rtmsg header was
 * supplied) and emit the route via rt6_fill_node() with NLM_F_MULTI.
 */
2078 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2080 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2083 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2084 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2085 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2089 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2090 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2091 prefix, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - RTM_GETROUTE handler: build a flow from the
 * request attributes (src/dst addresses, iif, oif), resolve it through
 * ip6_route_output(), serialize the result with rt6_fill_node() and
 * unicast the reply back to the requester.
 */
2094 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2096 struct nlattr *tb[RTA_MAX+1];
2097 struct rt6_info *rt;
2098 struct sk_buff *skb;
2103 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2108 memset(&fl, 0, sizeof(fl));
2111 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2114 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2118 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2121 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2125 iif = nla_get_u32(tb[RTA_IIF]);
2128 fl.oif = nla_get_u32(tb[RTA_OIF]);
/* an input interface was named: validate it exists */
2131 struct net_device *dev;
2132 dev = __dev_get_by_index(iif);
2139 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2145 /* Reserve room for dummy headers, this skb can pass
2146 through good chunk of routing engine.
2148 skb->mac.raw = skb->data;
2149 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2151 rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
/* attach the resolved route; the reply reports this dst */
2152 skb->dst = &rt->u.dst;
2154 err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2155 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2156 nlh->nlmsg_seq, 0, 0);
2162 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
/*
 * inet6_rt_notify - broadcast a route change (@event, e.g.
 * RTM_NEWROUTE/RTM_DELROUTE) to the RTNLGRP_IPV6_ROUTE multicast
 * group, echoing the originating pid/seq from @info when present.
 * On failure the error is recorded with rtnl_set_sk_err().
 */
2167 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2169 struct sk_buff *skb;
2170 u32 pid = 0, seq = 0;
2171 struct nlmsghdr *nlh = NULL;
/* rtmsg header plus a generous attribute estimate */
2172 int payload = sizeof(struct rtmsg) + 256;
2179 seq = nlh->nlmsg_seq;
2182 skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2186 err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2192 err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2195 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2202 #ifdef CONFIG_PROC_FS
2204 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/*
 * rt6_info_route - format one route as a fixed-width text line for
 * /proc/net/ipv6_route: dst addr/plen, src addr/plen (zeros without
 * CONFIG_IPV6_SUBTREES), next hop, metric, refcnt, use count, flags
 * and device name.  Entries are skipped until the requested file
 * offset is reached (each line is RT6_INFO_LEN bytes).
 */
2215 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2217 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
/* seek: skip whole lines that precede the requested offset */
2220 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2225 if (arg->len >= arg->length)
2228 for (i=0; i<16; i++) {
2229 sprintf(arg->buffer + arg->len, "%02x",
2230 rt->rt6i_dst.addr.s6_addr[i]);
2233 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2236 #ifdef CONFIG_IPV6_SUBTREES
2237 for (i=0; i<16; i++) {
2238 sprintf(arg->buffer + arg->len, "%02x",
2239 rt->rt6i_src.addr.s6_addr[i]);
2242 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
/* no subtrees: emit an all-zero source prefix */
2245 sprintf(arg->buffer + arg->len,
2246 "00000000000000000000000000000000 00 ");
2250 if (rt->rt6i_nexthop) {
2251 for (i=0; i<16; i++) {
2252 sprintf(arg->buffer + arg->len, "%02x",
2253 rt->rt6i_nexthop->primary_key[i]);
2257 sprintf(arg->buffer + arg->len,
2258 "00000000000000000000000000000000");
2261 arg->len += sprintf(arg->buffer + arg->len,
2262 " %08x %08x %08x %08x %8s\n",
2263 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2264 rt->u.dst.__use, rt->rt6i_flags,
2265 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/*
 * rt6_proc_info - legacy procfs read handler for /proc/net/ipv6_route:
 * walks every table with rt6_info_route() and fixes up *start/length
 * for the sub-line remainder of the requested offset.
 */
2269 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2271 struct rt6_proc_arg arg = {
2277 fib6_clean_all(rt6_info_route, 0, &arg);
/* account for the partial line at the requested offset */
2281 *start += offset % RT6_INFO_LEN;
2283 arg.len -= offset % RT6_INFO_LEN;
2285 if (arg.len > length)
/*
 * rt6_stats_seq_show - emit the single /proc/net/rt6_stats line:
 * fib node / route-node / alloc / entry / cache counters, the live
 * dst-entry count and the discarded-route counter, all in hex.
 */
2293 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2295 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2296 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2297 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2298 rt6_stats.fib_rt_cache,
2299 atomic_read(&ip6_dst_ops.entries),
2300 rt6_stats.fib_discarded_routes);
/* Open handler: single_open() since the stats file is one record. */
2305 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2307 return single_open(file, rt6_stats_seq_show, NULL);
/* File operations for /proc/net/rt6_stats (seq_file, single record). */
2310 static struct file_operations rt6_stats_seq_fops = {
2311 .owner = THIS_MODULE,
2312 .open = rt6_stats_seq_open,
2314 .llseek = seq_lseek,
2315 .release = single_release,
2317 #endif /* CONFIG_PROC_FS */
2319 #ifdef CONFIG_SYSCTL
/* Scratch variable written by the "flush" sysctl below. */
2321 static int flush_delay;
/*
 * ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush:
 * writing a positive delay schedules garbage collection after that
 * many jiffies; zero or negative forces an immediate full flush
 * (~0UL timeout).
 */
2324 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2325 void __user *buffer, size_t *lenp, loff_t *ppos)
2328 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2329 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
/*
 * sysctl table for net.ipv6.route.*: cache flush trigger, GC tuning
 * (threshold, interval, timeout, elasticity), max table size, PMTU
 * expiry and minimum advertised MSS.  Time-valued entries use the
 * jiffies conversion handlers; gc_min_interval is also exposed in
 * milliseconds via gc_min_interval_ms.
 */
2335 ctl_table ipv6_route_table[] = {
2337 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2338 .procname = "flush",
2339 .data = &flush_delay,
2340 .maxlen = sizeof(int),
2342 .proc_handler = &ipv6_sysctl_rtcache_flush
2345 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2346 .procname = "gc_thresh",
2347 .data = &ip6_dst_ops.gc_thresh,
2348 .maxlen = sizeof(int),
2350 .proc_handler = &proc_dointvec,
2353 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2354 .procname = "max_size",
2355 .data = &ip6_rt_max_size,
2356 .maxlen = sizeof(int),
2358 .proc_handler = &proc_dointvec,
2361 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2362 .procname = "gc_min_interval",
2363 .data = &ip6_rt_gc_min_interval,
2364 .maxlen = sizeof(int),
2366 .proc_handler = &proc_dointvec_jiffies,
2367 .strategy = &sysctl_jiffies,
2370 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2371 .procname = "gc_timeout",
2372 .data = &ip6_rt_gc_timeout,
2373 .maxlen = sizeof(int),
2375 .proc_handler = &proc_dointvec_jiffies,
2376 .strategy = &sysctl_jiffies,
2379 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2380 .procname = "gc_interval",
2381 .data = &ip6_rt_gc_interval,
2382 .maxlen = sizeof(int),
2384 .proc_handler = &proc_dointvec_jiffies,
2385 .strategy = &sysctl_jiffies,
2388 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2389 .procname = "gc_elasticity",
2390 .data = &ip6_rt_gc_elasticity,
2391 .maxlen = sizeof(int),
2393 .proc_handler = &proc_dointvec_jiffies,
2394 .strategy = &sysctl_jiffies,
2397 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2398 .procname = "mtu_expires",
2399 .data = &ip6_rt_mtu_expires,
2400 .maxlen = sizeof(int),
2402 .proc_handler = &proc_dointvec_jiffies,
2403 .strategy = &sysctl_jiffies,
2406 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2407 .procname = "min_adv_mss",
2408 .data = &ip6_rt_min_advmss,
2409 .maxlen = sizeof(int),
2411 .proc_handler = &proc_dointvec_jiffies,
2412 .strategy = &sysctl_jiffies,
/* same variable as gc_min_interval, but in milliseconds */
2415 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2416 .procname = "gc_min_interval_ms",
2417 .data = &ip6_rt_gc_min_interval,
2418 .maxlen = sizeof(int),
2420 .proc_handler = &proc_dointvec_ms_jiffies,
2421 .strategy = &sysctl_ms_jiffies,
/*
 * ip6_route_init - boot-time initialization of the IPv6 routing
 * subsystem: create the rt6_info slab cache (fatal if it fails) and
 * register the /proc/net entries.
 */
2428 void __init ip6_route_init(void)
2430 struct proc_dir_entry *p;
2432 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2433 sizeof(struct rt6_info),
2434 0, SLAB_HWCACHE_ALIGN,
2436 if (!ip6_dst_ops.kmem_cachep)
2437 panic("cannot create ip6_dst_cache");
2440 #ifdef CONFIG_PROC_FS
2441 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2443 p->owner = THIS_MODULE;
2445 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2450 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * ip6_route_cleanup - teardown counterpart of ip6_route_init: remove
 * policy-routing rules (if configured), unregister the /proc entries
 * and destroy the rt6_info slab cache.
 */
2455 void ip6_route_cleanup(void)
2457 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2458 fib6_rules_cleanup();
2460 #ifdef CONFIG_PROC_FS
2461 proc_net_remove("ipv6_route");
2462 proc_net_remove("rt6_stats");
2469 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);