 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *		Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
65 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
66 const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void ip6_dst_destroy(struct dst_entry *);
72 static void ip6_dst_ifdown(struct dst_entry *,
73 struct net_device *dev, int how);
74 static int ip6_dst_gc(struct dst_ops *ops);
76 static int ip6_pkt_discard(struct sk_buff *skb);
77 static int ip6_pkt_discard_out(struct sk_buff *skb);
78 static void ip6_link_failure(struct sk_buff *skb);
79 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83 const struct in6_addr *prefix, int prefixlen,
84 const struct in6_addr *gwaddr, int ifindex,
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87 const struct in6_addr *prefix, int prefixlen,
88 const struct in6_addr *gwaddr, int ifindex);
/* dst_ops->cow_metrics callback: for a DST_HOST route, copy-on-write the
 * shared metrics array into the route's inet_peer so it can be modified,
 * publishing the new pointer with cmpxchg() on dst->_metrics.
 * NOTE(review): this excerpt is missing several original lines (braces,
 * early returns, the peer-metrics assignment); comments describe only
 * what is visible here. */
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
93 struct rt6_info *rt = (struct rt6_info *) dst;
94 struct inet_peer *peer;
/* only host routes get writable per-peer metrics */
97 if (!(rt->dst.flags & DST_HOST))
/* lazily attach an inet_peer (create=1) if none is bound yet */
101 rt6_bind_peer(rt, 1);
103 peer = rt->rt6i_peer;
105 u32 *old_p = __DST_METRICS_PTR(old);
106 unsigned long prev, new;
/* a freshly created peer inherits the current (read-only) metrics */
109 if (inet_metrics_new(peer))
110 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
/* atomically swing dst->_metrics from the old array to the peer's */
112 new = (unsigned long) p;
113 prev = cmpxchg(&dst->_metrics, old, new);
/* lost the race: another CPU installed metrics first — use theirs,
 * unless they are still marked read-only */
116 p = __DST_METRICS_PTR(prev);
117 if (prev & DST_METRICS_READ_ONLY)
/* Pick the address used for neighbour resolution: the route's gateway
 * when one is set, otherwise the original destination.
 * NOTE(review): closing brace / fallthrough return elided in this excerpt. */
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
126 struct in6_addr *p = &rt->rt6i_gateway;
128 if (!ipv6_addr_any(p))
129 return (const void *) p;
/* dst_ops->neigh_lookup callback: resolve (or create) the ndisc
 * neighbour entry for this dst on dst->dev. */
133 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
135 struct rt6_info *rt = (struct rt6_info *) dst;
138 daddr = choose_neigh_daddr(rt, daddr);
139 n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
/* no cached entry — create one in the ndisc table */
142 return neigh_create(&nd_tbl, daddr, dst->dev);
/* Bind a neighbour entry for rt's gateway to rt->dst; looks up the
 * entry first and creates it when absent. */
145 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
147 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
149 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
153 dst_set_neighbour(&rt->dst, n);
/* Template dst_ops for normal IPv6 routes; copied per-namespace.
 * NOTE(review): some initializers (.family, .gc, .mtu, closing brace)
 * are elided in this excerpt. */
158 static struct dst_ops ip6_dst_ops_template = {
160 .protocol = cpu_to_be16(ETH_P_IPV6),
163 .check = ip6_dst_check,
164 .default_advmss = ip6_default_advmss,
166 .cow_metrics = ipv6_cow_metrics,
167 .destroy = ip6_dst_destroy,
168 .ifdown = ip6_dst_ifdown,
169 .negative_advice = ip6_negative_advice,
170 .link_failure = ip6_link_failure,
171 .update_pmtu = ip6_rt_update_pmtu,
172 .local_out = __ip6_local_out,
173 .neigh_lookup = ip6_neigh_lookup,
/* Blackhole routes never store a PMTU metric; report the raw metric
 * if set, else the device MTU. */
176 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
178 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
180 return mtu ? : dst->dev->mtu;
/* Blackhole variants deliberately ignore PMTU updates and metric COW
 * (bodies elided/empty in this excerpt). */
183 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
187 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops used for xfrm blackhole copies of IPv6 routes. */
193 static struct dst_ops ip6_dst_blackhole_ops = {
195 .protocol = cpu_to_be16(ETH_P_IPV6),
196 .destroy = ip6_dst_destroy,
197 .check = ip6_dst_check,
198 .mtu = ip6_blackhole_mtu,
199 .default_advmss = ip6_default_advmss,
200 .update_pmtu = ip6_rt_blackhole_update_pmtu,
201 .cow_metrics = ip6_rt_blackhole_cow_metrics,
202 .neigh_lookup = ip6_neigh_lookup,
/* Read-only metrics template for the special route entries below. */
205 static const u32 ip6_template_metrics[RTAX_MAX] = {
206 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the per-namespace "null" route: rejects all traffic
 * with -ENETUNREACH.  Worst possible metric so it never wins selection.
 * NOTE(review): the nested .dst initializer lines are partly elided. */
209 static struct rt6_info ip6_null_entry_template = {
211 .__refcnt = ATOMIC_INIT(1),
214 .error = -ENETUNREACH,
215 .input = ip6_pkt_discard,
216 .output = ip6_pkt_discard_out,
218 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
219 .rt6i_protocol = RTPROT_KERNEL,
220 .rt6i_metric = ~(u32) 0,
221 .rt6i_ref = ATOMIC_INIT(1),
/* With policy routing, two more reject-style templates exist:
 * "prohibit" (administratively denied) and "blackhole" (silent drop). */
224 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
226 static int ip6_pkt_prohibit(struct sk_buff *skb);
227 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
229 static struct rt6_info ip6_prohibit_entry_template = {
231 .__refcnt = ATOMIC_INIT(1),
235 .input = ip6_pkt_prohibit,
236 .output = ip6_pkt_prohibit_out,
238 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
239 .rt6i_protocol = RTPROT_KERNEL,
240 .rt6i_metric = ~(u32) 0,
241 .rt6i_ref = ATOMIC_INIT(1),
/* blackhole: discard silently in both directions */
244 static struct rt6_info ip6_blk_hole_entry_template = {
246 .__refcnt = ATOMIC_INIT(1),
250 .input = dst_discard,
251 .output = dst_discard,
253 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
254 .rt6i_protocol = RTPROT_KERNEL,
255 .rt6i_metric = ~(u32) 0,
256 .rt6i_ref = ATOMIC_INIT(1),
261 /* allocate dst with ip6_dst_ops */
/* Allocate an rt6_info and zero every field after the embedded
 * dst_entry (rt6i_table is the first such field). */
262 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
263 struct net_device *dev,
266 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
269 memset(&rt->rt6i_table, 0,
270 sizeof(*rt) - sizeof(struct dst_entry));
/* dst_ops->destroy callback: drop the route's references to its
 * inet6_dev, its "from" dst and its inet_peer.
 * NOTE(review): the in6_dev_put/inet_putpeer calls are elided here. */
275 static void ip6_dst_destroy(struct dst_entry *dst)
277 struct rt6_info *rt = (struct rt6_info *)dst;
278 struct inet6_dev *idev = rt->rt6i_idev;
279 struct inet_peer *peer = rt->rt6i_peer;
/* non-host routes may own a kmalloc'd metrics array */
281 if (!(rt->dst.flags & DST_HOST))
282 dst_destroy_metrics_generic(dst);
285 rt->rt6i_idev = NULL;
/* routes that inherit expiry hold a ref on their parent via dst->from */
289 if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
290 dst_release(dst->from);
293 rt->rt6i_peer = NULL;
/* Generation counter bumped when peer state changes; cached per-route
 * so ip6_dst_check() can notice staleness. */
298 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
300 static u32 rt6_peer_genid(void)
302 return atomic_read(&__rt6_peer_genid);
/* Attach an inet_peer for the route's destination.  cmpxchg() makes
 * binding race-free; a losing peer is dropped (line elided here). */
305 void rt6_bind_peer(struct rt6_info *rt, int create)
307 struct inet_peer *peer;
309 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
310 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
313 rt->rt6i_peer_genid = rt6_peer_genid();
/* dst_ops->ifdown callback: when dev goes away, re-home the route's
 * inet6_dev reference onto the namespace loopback device. */
316 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
319 struct rt6_info *rt = (struct rt6_info *)dst;
320 struct inet6_dev *idev = rt->rt6i_idev;
321 struct net_device *loopback_dev =
322 dev_net(dev)->loopback_dev;
324 if (dev != loopback_dev && idev && idev->dev == dev) {
325 struct inet6_dev *loopback_idev =
326 in6_dev_get(loopback_dev);
328 rt->rt6i_idev = loopback_idev;
/* Return non-zero when the route has expired: either its own
 * RTF_EXPIRES timer lapsed, or it inherits expiry from the parent
 * route it was cloned from (dst.from).
 * NOTE(review): the "return 1/0" lines are elided in this excerpt. */
334 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
336 struct rt6_info *ort = NULL;
338 if (rt->rt6i_flags & RTF_EXPIRES) {
339 if (time_after(jiffies, rt->dst.expires))
341 } else if (rt->dst.from) {
342 ort = (struct rt6_info *) rt->dst.from;
343 return (ort->rt6i_flags & RTF_EXPIRES) &&
344 time_after(jiffies, ort->dst.expires);
349 static inline int rt6_need_strict(const struct in6_addr *daddr)
351 return ipv6_addr_type(daddr) &
352 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
356 * Route lookup. Any table->tb6_lock is implied.
/* Walk a same-prefix route list and pick the entry matching the
 * requested outgoing interface (oif); loopback routes and source
 * address ownership are handled specially.
 * NOTE(review): several continue/return lines are elided here. */
359 static inline struct rt6_info *rt6_device_match(struct net *net,
361 const struct in6_addr *saddr,
365 struct rt6_info *local = NULL;
366 struct rt6_info *sprt;
/* nothing to constrain on — first route wins (return elided) */
368 if (!oif && ipv6_addr_any(saddr))
371 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
372 struct net_device *dev = sprt->dst.dev;
375 if (dev->ifindex == oif)
/* loopback device: acceptable as a local fallback unless strict
 * interface matching is requested */
377 if (dev->flags & IFF_LOOPBACK) {
378 if (!sprt->rt6i_idev ||
379 sprt->rt6i_idev->dev->ifindex != oif) {
380 if (flags & RT6_LOOKUP_F_IFACE && oif)
382 if (local && (!oif ||
383 local->rt6i_idev->dev->ifindex == oif))
/* no oif: match on the device owning saddr instead */
389 if (ipv6_chk_addr(net, saddr, dev,
390 flags & RT6_LOOKUP_F_IFACE))
/* strict lookup with no match falls back to the null route */
399 if (flags & RT6_LOOKUP_F_IFACE)
400 return net->ipv6.ip6_null_entry;
/* Router Reachability Probing (RFC 4191-era behaviour): send a
 * rate-limited neighbour solicitation toward an unreachable router. */
406 #ifdef CONFIG_IPV6_ROUTER_PREF
407 static void rt6_probe(struct rt6_info *rt)
409 struct neighbour *neigh;
411 * Okay, this does not seem to be appropriate
412 * for now, however, we need to check if it
413 * is really so; aka Router Reachability Probing.
415 * Router Reachability Probe MUST be rate-limited
416 * to no more than one per minute.
419 neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
420 if (!neigh || (neigh->nud_state & NUD_VALID))
422 read_lock_bh(&neigh->lock);
/* re-check under the lock; rtr_probe_interval enforces the rate limit */
423 if (!(neigh->nud_state & NUD_VALID) &&
424 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
425 struct in6_addr mcaddr;
426 struct in6_addr *target;
428 neigh->updated = jiffies;
429 read_unlock_bh(&neigh->lock);
/* solicit the router via its solicited-node multicast address */
431 target = (struct in6_addr *)&neigh->primary_key;
432 addrconf_addr_solict_mult(target, &mcaddr);
433 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
435 read_unlock_bh(&neigh->lock);
/* !CONFIG_IPV6_ROUTER_PREF stub: probing disabled */
441 static inline void rt6_probe(struct rt6_info *rt)
447 * Default Router Selection (RFC 2461 6.3.6)
449 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
451 struct net_device *dev = rt->dst.dev;
452 if (!oif || dev->ifindex == oif)
454 if ((dev->flags & IFF_LOOPBACK) &&
455 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Score the route's next-hop reachability from its neighbour cache
 * state (NUD_VALID best; NUD_FAILED worst under ROUTER_PREF).
 * NOTE(review): the ret assignments/returns are elided in this excerpt. */
460 static inline int rt6_check_neigh(struct rt6_info *rt)
462 struct neighbour *neigh;
466 neigh = dst_get_neighbour_noref(&rt->dst)
467 if (rt->rt6i_flags & RTF_NONEXTHOP ||
468 !(rt->rt6i_flags & RTF_GATEWAY))
471 read_lock_bh(&neigh->lock);
472 if (neigh->nud_state & NUD_VALID)
474 #ifdef CONFIG_IPV6_ROUTER_PREF
475 else if (neigh->nud_state & NUD_FAILED)
480 read_unlock_bh(&neigh->lock);
/* Combined route score: device match, (optional) RA preference bits,
 * and neighbour reachability.  RT6_LOOKUP_F_IFACE / _REACHABLE in
 * strict make the respective component mandatory. */
487 static int rt6_score_route(struct rt6_info *rt, int oif,
492 m = rt6_check_dev(rt, oif);
493 if (!m && (strict & RT6_LOOKUP_F_IFACE))
495 #ifdef CONFIG_IPV6_ROUTER_PREF
496 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
498 n = rt6_check_neigh(rt);
499 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/* Keep the best-scoring, unexpired candidate while iterating;
 * kicks off rt6_probe() for reachability-strict lookups (elided). */
504 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
505 int *mpri, struct rt6_info *match)
509 if (rt6_check_expired(rt))
512 m = rt6_score_route(rt, oif, strict);
517 if (strict & RT6_LOOKUP_F_REACHABLE)
521 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/* Scan the round-robin window: entries from rr_head to the end of the
 * equal-metric run, then from the leaf back up to rr_head. */
529 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
530 struct rt6_info *rr_head,
531 u32 metric, int oif, int strict)
533 struct rt6_info *rt, *match;
537 for (rt = rr_head; rt && rt->rt6i_metric == metric;
538 rt = rt->dst.rt6_next)
539 match = find_match(rt, oif, strict, &mpri, match);
540 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
541 rt = rt->dst.rt6_next)
542 match = find_match(rt, oif, strict, &mpri, match);
/* Default router selection for a fib6 node: pick the best candidate,
 * advancing fn->rr_ptr (round-robin) when nothing was reachable. */
547 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
549 struct rt6_info *match, *rt0;
554 fn->rr_ptr = rt0 = fn->leaf;
556 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
559 (strict & RT6_LOOKUP_F_REACHABLE)) {
560 struct rt6_info *next = rt0->dst.rt6_next;
562 /* no entries matched; do round-robin */
563 if (!next || next->rt6i_metric != rt0->rt6i_metric)
570 net = dev_net(rt0->dst.dev);
571 return match ? match : net->ipv6.ip6_null_entry;
574 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option from a Router Advertisement
 * (RFC 4191): validate lengths, then add/update/delete the
 * corresponding RTF_ROUTEINFO route.
 * NOTE(review): several error-return and brace lines are elided. */
575 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
576 const struct in6_addr *gwaddr)
578 struct net *net = dev_net(dev);
579 struct route_info *rinfo = (struct route_info *) opt;
580 struct in6_addr prefix_buf, *prefix;
582 unsigned long lifetime;
585 if (len < sizeof(struct route_info)) {
589 /* Sanity check for prefix_len and length */
590 if (rinfo->length > 3) {
592 } else if (rinfo->prefix_len > 128) {
594 } else if (rinfo->prefix_len > 64) {
595 if (rinfo->length < 2) {
598 } else if (rinfo->prefix_len > 0) {
599 if (rinfo->length < 1) {
/* invalid preference is treated as medium (elided assignment) */
604 pref = rinfo->route_pref;
605 if (pref == ICMPV6_ROUTER_PREF_INVALID)
608 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length==3 carries a full 128-bit prefix; otherwise copy only the
 * advertised prefix_len bits into a local buffer */
610 if (rinfo->length == 3)
611 prefix = (struct in6_addr *)rinfo->prefix;
613 /* this function is safe */
614 ipv6_addr_prefix(&prefix_buf,
615 (struct in6_addr *)rinfo->prefix,
617 prefix = &prefix_buf;
620 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* zero lifetime means withdraw the route */
623 if (rt && !lifetime) {
629 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
632 rt->rt6i_flags = RTF_ROUTEINFO |
633 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* infinite lifetime clears expiry; finite lifetime (re)arms it */
636 if (!addrconf_finite_timeout(lifetime))
637 rt6_clean_expires(rt);
639 rt6_set_expires(rt, jiffies + HZ * lifetime);
641 dst_release(&rt->dst);
/* Shared backtracking for fib lookups: on a null-route result, climb
 * toward the tree root, re-descending into source subtrees, until a
 * node with RTN_RTINFO is found or the root is hit. */
647 #define BACKTRACK(__net, saddr) \
649 if (rt == __net->ipv6.ip6_null_entry) { \
650 struct fib6_node *pn; \
652 if (fn->fn_flags & RTN_TL_ROOT) \
655 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
656 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
659 if (fn->fn_flags & RTN_RTINFO) \
/* Table lookup without cloning: find the fib6 node, run device
 * matching, backtrack on failure, and bump the dst's use count under
 * the table read lock. */
665 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
666 struct fib6_table *table,
667 struct flowi6 *fl6, int flags)
669 struct fib6_node *fn;
672 read_lock_bh(&table->tb6_lock);
673 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
676 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
677 BACKTRACK(net, &fl6->saddr);
679 dst_use(&rt->dst, jiffies);
680 read_unlock_bh(&table->tb6_lock);
/* Exported thin wrapper: policy-rule dispatch into the lookup above. */
685 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
688 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
690 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by (daddr, saddr, oif); strict selects
 * RT6_LOOKUP_F_IFACE.  NOTE(review): flowi6 initializers and the
 * saddr NULL-check are partly elided here. */
692 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
693 const struct in6_addr *saddr, int oif, int strict)
695 struct flowi6 fl6 = {
699 struct dst_entry *dst;
700 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
703 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
704 flags |= RT6_LOOKUP_F_HAS_SADDR;
707 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
709 return (struct rt6_info *) dst;
716 EXPORT_SYMBOL(rt6_lookup);
718 /* ip6_ins_rt is called with FREE table->tb6_lock.
719 It takes new route entry, the addition fails by any reason the
720 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its FIB table under the table write lock. */
724 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
727 struct fib6_table *table;
729 table = rt->rt6i_table;
730 write_lock_bh(&table->tb6_lock);
731 err = fib6_add(&table->tb6_root, rt, info);
732 write_unlock_bh(&table->tb6_lock);
737 int ip6_ins_rt(struct rt6_info *rt)
739 struct nl_info info = {
740 .nl_net = dev_net(rt->dst.dev),
742 return __ip6_ins_rt(rt, &info);
/* Clone @ort into an RTF_CACHE host route for (daddr, saddr), binding
 * a neighbour entry; on neighbour-table overflow, temporarily relax GC
 * sysctls and retry.  NOTE(review): retry/goto and several brace lines
 * are elided in this excerpt. */
745 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
746 const struct in6_addr *daddr,
747 const struct in6_addr *saddr)
755 rt = ip6_rt_copy(ort, daddr);
/* avoid retry loops in softirq context */
758 int attempts = !in_softirq();
760 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
761 if (ort->rt6i_dst.plen != 128 &&
762 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
763 rt->rt6i_flags |= RTF_ANYCAST;
764 rt->rt6i_gateway = *daddr;
767 rt->rt6i_flags |= RTF_CACHE;
769 #ifdef CONFIG_IPV6_SUBTREES
770 if (rt->rt6i_src.plen && saddr) {
771 rt->rt6i_src.addr = *saddr;
772 rt->rt6i_src.plen = 128;
/* neighbour binding failed: force a GC pass with relaxed limits,
 * restore the saved sysctls afterwards */
777 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
778 struct net *net = dev_net(rt->dst.dev);
779 int saved_rt_min_interval =
780 net->ipv6.sysctl.ip6_rt_gc_min_interval;
781 int saved_rt_elasticity =
782 net->ipv6.sysctl.ip6_rt_gc_elasticity;
784 if (attempts-- > 0) {
785 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
786 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
788 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
790 net->ipv6.sysctl.ip6_rt_gc_elasticity =
792 net->ipv6.sysctl.ip6_rt_gc_min_interval =
793 saved_rt_min_interval;
799 "ipv6: Neighbour table overflow.\n");
/* Shallow cache clone of @ort for daddr; shares ort's neighbour. */
808 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
809 const struct in6_addr *daddr)
811 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
814 rt->rt6i_flags |= RTF_CACHE;
815 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
/* Core slow-path lookup: select a route (reachability-strict when
 * forwarding is off), then clone it into the cache (cow or plain
 * clone), insert, and re-lookup on insert races.
 * NOTE(review): goto labels and retry control flow are elided. */
820 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
821 struct flowi6 *fl6, int flags)
823 struct fib6_node *fn;
824 struct rt6_info *rt, *nrt;
828 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
830 strict |= flags & RT6_LOOKUP_F_IFACE;
833 read_lock_bh(&table->tb6_lock);
836 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
839 rt = rt6_select(fn, oif, strict | reachable);
841 BACKTRACK(net, &fl6->saddr);
842 if (rt == net->ipv6.ip6_null_entry ||
843 rt->rt6i_flags & RTF_CACHE)
847 read_unlock_bh(&table->tb6_lock);
/* no neighbour bound yet and a next hop exists: make a cow clone;
 * otherwise non-host routes get a plain clone */
849 if (!dst_get_neighbour_noref_raw(&rt->dst) &&
850 !(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_LOCAL)))
851 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
852 else if (!(rt->dst.flags & DST_HOST))
853 nrt = rt6_alloc_clone(rt, &fl6->daddr);
857 dst_release(&rt->dst);
858 rt = nrt ? : net->ipv6.ip6_null_entry;
862 err = ip6_ins_rt(nrt);
871 * Race condition! In the gap, when table->tb6_lock was
872 * released someone could insert this route. Relookup.
874 dst_release(&rt->dst);
883 read_unlock_bh(&table->tb6_lock);
885 rt->dst.lastuse = jiffies;
891 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
892 struct flowi6 *fl6, int flags)
894 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
897 static struct dst_entry *ip6_route_input_lookup(struct net *net,
898 struct net_device *dev,
899 struct flowi6 *fl6, int flags)
901 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
902 flags |= RT6_LOOKUP_F_IFACE;
904 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/* Input-path entry point: build a flowi6 from the received packet's
 * IPv6 header and attach the looked-up dst to the skb.
 * NOTE(review): the .daddr/.saddr initializers are elided here. */
907 void ip6_route_input(struct sk_buff *skb)
909 const struct ipv6hdr *iph = ipv6_hdr(skb);
910 struct net *net = dev_net(skb->dev);
911 int flags = RT6_LOOKUP_F_HAS_SADDR;
912 struct flowi6 fl6 = {
913 .flowi6_iif = skb->dev->ifindex,
/* first 32 bits of the header carry version/class/flow label */
916 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
917 .flowi6_mark = skb->mark,
918 .flowi6_proto = iph->nexthdr,
921 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
924 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
925 struct flowi6 *fl6, int flags)
927 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Output-path entry point: derive lookup flags from the socket
 * (bound device, source-address preferences) and dispatch through the
 * policy rules.  NOTE(review): the sk NULL-guard around srcprefs is
 * elided in this excerpt. */
930 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
935 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
936 flags |= RT6_LOOKUP_F_IFACE;
938 if (!ipv6_addr_any(&fl6->saddr))
939 flags |= RT6_LOOKUP_F_HAS_SADDR;
941 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
943 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
946 EXPORT_SYMBOL(ip6_route_output);
/* Build a blackhole copy of @dst_orig for xfrm: same keys/metrics,
 * but input/output discard everything.  Releases dst_orig. */
948 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
950 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
951 struct dst_entry *new = NULL;
953 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
955 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
960 new->input = dst_discard;
961 new->output = dst_discard;
/* read-only metrics can be shared by pointer; otherwise deep-copy */
963 if (dst_metrics_read_only(&ort->dst))
964 new->_metrics = ort->dst._metrics;
966 dst_copy_metrics(new, &ort->dst);
967 rt->rt6i_idev = ort->rt6i_idev;
969 in6_dev_hold(rt->rt6i_idev);
971 rt->rt6i_gateway = ort->rt6i_gateway;
972 rt->rt6i_flags = ort->rt6i_flags;
973 rt6_clean_expires(rt);
976 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
977 #ifdef CONFIG_IPV6_SUBTREES
978 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
984 dst_release(dst_orig);
985 return new ? new : ERR_PTR(-ENOMEM);
989 * Destination cache support functions
/* dst_ops->check callback: a cached route is still valid while its
 * fib6 node's serial number matches the cookie; refresh the peer
 * binding if the peer generation moved on.
 * NOTE(review): the return-dst / return-NULL lines are elided. */
992 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
996 rt = (struct rt6_info *) dst;
998 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
999 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1001 rt6_bind_peer(rt, 0);
1002 rt->rt6i_peer_genid = rt6_peer_genid();
/* dst_ops->negative_advice callback: drop expired RTF_CACHE clones. */
1009 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1011 struct rt6_info *rt = (struct rt6_info *) dst;
1014 if (rt->rt6i_flags & RTF_CACHE) {
1015 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure callback: tell the sender the address is
 * unreachable, then expire the cached clone or invalidate the
 * default-route node's serial number. */
1027 static void ip6_link_failure(struct sk_buff *skb)
1029 struct rt6_info *rt;
1031 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1033 rt = (struct rt6_info *) skb_dst(skb);
1035 if (rt->rt6i_flags & RTF_CACHE)
1036 rt6_update_expires(rt, 0);
1037 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1038 rt->rt6i_node->fn_sernum = -1;
/* dst_ops->update_pmtu callback: only host (/128) routes are updated;
 * below IPV6_MIN_MTU, keep the floor and set ALLFRAG instead
 * (the mtu = IPV6_MIN_MTU clamp line is elided here). */
1042 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1044 struct rt6_info *rt6 = (struct rt6_info*)dst;
1046 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1047 rt6->rt6i_flags |= RTF_MODIFIED;
1048 if (mtu < IPV6_MIN_MTU) {
1049 u32 features = dst_metric(dst, RTAX_FEATURES);
1051 features |= RTAX_FEATURE_ALLFRAG;
1052 dst_metric_set(dst, RTAX_FEATURES, features);
1054 dst_metric_set(dst, RTAX_MTU, mtu);
/* dst_ops->default_advmss callback: path MTU minus IPv6+TCP headers,
 * clamped between the ip6_rt_min_advmss sysctl and the jumbogram
 * boundary described below. */
1058 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1060 struct net_device *dev = dst->dev;
1061 unsigned int mtu = dst_mtu(dst);
1062 struct net *net = dev_net(dev);
1064 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1066 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1067 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1070 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1071 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1072 * IPV6_MAXPLEN is also valid and means: "any MSS,
1073 * rely only on pmtu discovery"
1075 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu callback: explicit metric wins, else the device's
 * inet6_dev mtu6.  NOTE(review): fallback/return lines are elided. */
1080 static unsigned int ip6_mtu(const struct dst_entry *dst)
1082 struct inet6_dev *idev;
1083 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1091 idev = __in6_dev_get(dst->dev);
1093 mtu = idev->cnf.mtu6;
/* ICMPv6 dsts are not inserted into the FIB; they live on this
 * dedicated list, protected by icmp6_dst_lock, and are reaped by
 * icmp6_dst_gc(). */
1099 static struct dst_entry *icmp6_dst_gc_list;
1100 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Build a standalone host dst for sending an ICMPv6 packet: allocate,
 * resolve the neighbour, chain onto the gc list, then pass through
 * xfrm.  NOTE(review): cleanup/goto paths are elided in this excerpt. */
1102 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1103 struct neighbour *neigh,
1106 struct dst_entry *dst;
1107 struct rt6_info *rt;
1108 struct inet6_dev *idev = in6_dev_get(dev);
1109 struct net *net = dev_net(dev);
1111 if (unlikely(!idev))
1112 return ERR_PTR(-ENODEV);
1114 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1115 if (unlikely(!rt)) {
1117 dst = ERR_PTR(-ENOMEM);
/* no neighbour supplied by the caller: resolve it ourselves */
1124 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1125 if (IS_ERR(neigh)) {
1128 return ERR_CAST(neigh);
1132 rt->dst.flags |= DST_HOST;
1133 rt->dst.output = ip6_output;
1134 dst_set_neighbour(&rt->dst, neigh);
1135 atomic_set(&rt->dst.__refcnt, 1);
1136 rt->rt6i_dst.addr = fl6->daddr;
1137 rt->rt6i_dst.plen = 128;
1138 rt->rt6i_idev = idev;
1139 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
/* publish on the icmp6 gc list */
1141 spin_lock_bh(&icmp6_dst_lock);
1142 rt->dst.next = icmp6_dst_gc_list;
1143 icmp6_dst_gc_list = &rt->dst;
1144 spin_unlock_bh(&icmp6_dst_lock);
1146 fib6_force_start_gc(net);
1148 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* Reap unreferenced entries from the icmp6 dst list; returns whether
 * any entries remain (return expression elided here). */
1154 int icmp6_dst_gc(void)
1156 struct dst_entry *dst, **pprev;
1159 spin_lock_bh(&icmp6_dst_lock);
1160 pprev = &icmp6_dst_gc_list;
1162 while ((dst = *pprev) != NULL) {
1163 if (!atomic_read(&dst->__refcnt)) {
1172 spin_unlock_bh(&icmp6_dst_lock);
/* Apply @func to every icmp6 dst, unlinking entries for which it
 * returns non-zero. */
1177 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1180 struct dst_entry *dst, **pprev;
1182 spin_lock_bh(&icmp6_dst_lock);
1183 pprev = &icmp6_dst_gc_list;
1184 while ((dst = *pprev) != NULL) {
1185 struct rt6_info *rt = (struct rt6_info *) dst;
1186 if (func(rt, arg)) {
1193 spin_unlock_bh(&icmp6_dst_lock);
/* dst_ops->gc callback: rate-limited garbage collection driven by the
 * ip6_rt_* sysctls; "expire" grows under pressure and decays with the
 * configured elasticity. */
1196 static int ip6_dst_gc(struct dst_ops *ops)
1198 unsigned long now = jiffies;
1199 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1200 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1201 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1202 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1203 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1204 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
/* too soon since the last GC and still under the size cap: skip */
1207 entries = dst_entries_get_fast(ops);
1208 if (time_after(rt_last_gc + rt_min_interval, now) &&
1209 entries <= rt_max_size)
1212 net->ipv6.ip6_rt_gc_expire++;
1213 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1214 net->ipv6.ip6_rt_last_gc = now;
1215 entries = dst_entries_get_slow(ops);
1216 if (entries < ops->gc_thresh)
1217 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1219 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1220 return entries > rt_max_size;
1223 /* Clean host part of a prefix. Not necessary in radix tree,
1224 but results in cleaner routing tables.
1226 Remove it only when all the things will work!
/* Effective hop limit for a dst: explicit metric, else the device's
 * inet6_dev setting, else the namespace-wide default. */
1229 int ip6_dst_hoplimit(struct dst_entry *dst)
1231 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1232 if (hoplimit == 0) {
1233 struct net_device *dev = dst->dev;
1234 struct inet6_dev *idev;
1237 idev = __in6_dev_get(dev);
1239 hoplimit = idev->cnf.hop_limit;
1241 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1246 EXPORT_SYMBOL(ip6_dst_hoplimit);
/* Add a route described by @cfg (netlink or ioctl origin): validate,
 * pick/create the table, allocate the rt6_info, fill destination /
 * source keys, gateway, metrics and device bindings, then insert.
 * NOTE(review): many error-goto lines and some branches are elided in
 * this excerpt; comments describe only the visible flow. */
1252 int ip6_route_add(struct fib6_config *cfg)
1255 struct net *net = cfg->fc_nlinfo.nl_net;
1256 struct rt6_info *rt = NULL;
1257 struct net_device *dev = NULL;
1258 struct inet6_dev *idev = NULL;
1259 struct fib6_table *table;
/* prefix lengths can never exceed 128 bits */
1262 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1264 #ifndef CONFIG_IPV6_SUBTREES
1265 if (cfg->fc_src_len)
1268 if (cfg->fc_ifindex) {
1270 dev = dev_get_by_index(net, cfg->fc_ifindex);
1273 idev = in6_dev_get(dev);
1278 if (cfg->fc_metric == 0)
1279 cfg->fc_metric = IP6_RT_PRIO_USER;
/* without NLM_F_CREATE, only add to an already-existing table */
1282 if (cfg->fc_nlinfo.nlh &&
1283 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1284 table = fib6_get_table(net, cfg->fc_table);
1286 printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1287 table = fib6_new_table(net, cfg->fc_table);
1290 table = fib6_new_table(net, cfg->fc_table);
1296 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1303 rt->dst.obsolete = -1;
1305 if (cfg->fc_flags & RTF_EXPIRES)
1306 rt6_set_expires(rt, jiffies +
1307 clock_t_to_jiffies(cfg->fc_expires));
1309 rt6_clean_expires(rt);
1311 if (cfg->fc_protocol == RTPROT_UNSPEC)
1312 cfg->fc_protocol = RTPROT_BOOT;
1313 rt->rt6i_protocol = cfg->fc_protocol;
/* choose the input handler by destination type */
1315 addr_type = ipv6_addr_type(&cfg->fc_dst);
1317 if (addr_type & IPV6_ADDR_MULTICAST)
1318 rt->dst.input = ip6_mc_input;
1319 else if (cfg->fc_flags & RTF_LOCAL)
1320 rt->dst.input = ip6_input;
1322 rt->dst.input = ip6_forward;
1324 rt->dst.output = ip6_output;
1326 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1327 rt->rt6i_dst.plen = cfg->fc_dst_len;
1328 if (rt->rt6i_dst.plen == 128)
1329 rt->dst.flags |= DST_HOST;
/* non-host routes with explicit metrics need their own array */
1331 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1332 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1337 dst_init_metrics(&rt->dst, metrics, 0);
1339 #ifdef CONFIG_IPV6_SUBTREES
1340 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1341 rt->rt6i_src.plen = cfg->fc_src_len;
1344 rt->rt6i_metric = cfg->fc_metric;
1346 /* We cannot add true routes via loopback here,
1347 they would result in kernel looping; promote them to reject routes
1349 if ((cfg->fc_flags & RTF_REJECT) ||
1350 (dev && (dev->flags & IFF_LOOPBACK) &&
1351 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1352 !(cfg->fc_flags & RTF_LOCAL))) {
1353 /* hold loopback dev/idev if we haven't done so. */
1354 if (dev != net->loopback_dev) {
1359 dev = net->loopback_dev;
1361 idev = in6_dev_get(dev);
1367 rt->dst.output = ip6_pkt_discard_out;
1368 rt->dst.input = ip6_pkt_discard;
1369 rt->dst.error = -ENETUNREACH;
1370 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1374 if (cfg->fc_flags & RTF_GATEWAY) {
1375 const struct in6_addr *gw_addr;
1378 gw_addr = &cfg->fc_gateway;
1379 rt->rt6i_gateway = *gw_addr;
1380 gwa_type = ipv6_addr_type(gw_addr);
1382 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1383 struct rt6_info *grt;
1385 /* IPv6 strictly inhibits using not link-local
1386 addresses as nexthop address.
1387 Otherwise, router will not able to send redirects.
1388 It is very good, but in some (rare!) circumstances
1389 (SIT, PtP, NBMA NOARP links) it is handy to allow
1390 some exceptions. --ANK
1393 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* the gateway itself must be reachable via a non-gateway route */
1396 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1398 err = -EHOSTUNREACH;
1402 if (dev != grt->dst.dev) {
1403 dst_release(&grt->dst);
1408 idev = grt->rt6i_idev;
1410 in6_dev_hold(grt->rt6i_idev);
1412 if (!(grt->rt6i_flags & RTF_GATEWAY))
1414 dst_release(&grt->dst);
1420 if (!dev || (dev->flags & IFF_LOOPBACK))
/* preferred-source address must be configured on the device */
1428 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1429 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1433 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1434 rt->rt6i_prefsrc.plen = 128;
1436 rt->rt6i_prefsrc.plen = 0;
1438 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1439 err = rt6_bind_neighbour(rt, dev);
1444 rt->rt6i_flags = cfg->fc_flags;
/* apply any RTA_METRICS attributes supplied by userspace */
1451 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1452 int type = nla_type(nla);
1455 if (type > RTAX_MAX) {
1460 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1466 rt->rt6i_idev = idev;
1467 rt->rt6i_table = table;
1469 cfg->fc_nlinfo.nl_net = dev_net(dev);
1471 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/* Remove @rt from its FIB table under the table write lock; the
 * namespace null route itself may never be deleted.
 * NOTE(review): the error-path goto and release label are elided. */
1483 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1486 struct fib6_table *table;
1487 struct net *net = dev_net(rt->dst.dev);
1489 if (rt == net->ipv6.ip6_null_entry) {
1494 table = rt->rt6i_table;
1495 write_lock_bh(&table->tb6_lock);
1496 err = fib6_del(rt, info);
1497 write_unlock_bh(&table->tb6_lock);
/* drop the caller's reference in all cases */
1500 dst_release(&rt->dst);
1504 int ip6_del_rt(struct rt6_info *rt)
1506 struct nl_info info = {
1507 .nl_net = dev_net(rt->dst.dev),
1509 return __ip6_del_rt(rt, &info);
/* Delete the route matching @cfg: locate the fib6 node for the
 * dst/src prefixes, then scan its leaf list for an entry matching the
 * optional ifindex, gateway and metric filters.
 * NOTE(review): continue statements and the dst_hold before unlock
 * are elided in this excerpt. */
1512 static int ip6_route_del(struct fib6_config *cfg)
1514 struct fib6_table *table;
1515 struct fib6_node *fn;
1516 struct rt6_info *rt;
1519 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1523 read_lock_bh(&table->tb6_lock);
1525 fn = fib6_locate(&table->tb6_root,
1526 &cfg->fc_dst, cfg->fc_dst_len,
1527 &cfg->fc_src, cfg->fc_src_len);
1530 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1531 if (cfg->fc_ifindex &&
1533 rt->dst.dev->ifindex != cfg->fc_ifindex))
1535 if (cfg->fc_flags & RTF_GATEWAY &&
1536 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1538 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1541 read_unlock_bh(&table->tb6_lock);
1543 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
/* no matching route found */
1546 read_unlock_bh(&table->tb6_lock);
/* Flow key extended with the redirecting router's address; the fl6
 * member (elided in this extract) must be first so the struct can be
 * cast from a plain flowi6 below. */
1554 struct ip6rd_flowi {
1556 struct in6_addr gateway;
/* Table-lookup callback used by ip6_route_redirect(): find the
 * current route toward fl6->daddr and accept it only if the redirect
 * came from that route's next hop (RFC 4861 §8 rule). */
1559 static struct rt6_info *__ip6_route_redirect(struct net *net,
1560 struct fib6_table *table,
1564 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1565 struct rt6_info *rt;
1566 struct fib6_node *fn;
1569 * Get the "current" route for this destination and
1570 * check if the redirect has come from approriate router.
1572 * RFC 2461 specifies that redirects should only be
1573 * accepted if they come from the nexthop to the target.
1574 * Due to the way the routes are chosen, this notion
1575 * is a bit fuzzy and one might need to check all possible
1579 read_lock_bh(&table->tb6_lock);
1580 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1582 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1584 * Current route is on-link; redirect is always invalid.
1586 * Seems, previous statement is not true. It could
1587 * be node, which looks for us as on-link (f.e. proxy ndisc)
1588 * But then router serving it might decide, that we should
1589 * know truth 8)8) --ANK (980726).
1591 if (rt6_check_expired(rt))
1593 if (!(rt->rt6i_flags & RTF_GATEWAY))
1595 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1597 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
/* No acceptable route: fall back to the null entry and retry via
 * BACKTRACK (subtree backtracking macro). */
1603 rt = net->ipv6.ip6_null_entry;
1604 BACKTRACK(net, &fl6->saddr);
1608 read_unlock_bh(&table->tb6_lock);
/* Build an ip6rd_flowi for a received redirect and resolve it through
 * fib6_rule_lookup() with __ip6_route_redirect as the table callback.
 * Strict (link-local/multicast) destinations force an interface match. */
1613 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1614 const struct in6_addr *src,
1615 const struct in6_addr *gateway,
1616 struct net_device *dev)
1618 int flags = RT6_LOOKUP_F_HAS_SADDR;
1619 struct net *net = dev_net(dev);
1620 struct ip6rd_flowi rdfl = {
1622 .flowi6_oif = dev->ifindex,
1628 rdfl.gateway = *gateway;
1630 if (rt6_need_strict(dest))
1631 flags |= RT6_LOOKUP_F_IFACE;
1633 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1634 flags, __ip6_route_redirect);
/* Process a validated NDISC redirect: confirm the old path, update the
 * neighbour entry from the supplied lladdr, clone the route with the
 * new gateway (RTF_DYNAMIC|RTF_CACHE) and insert it, then notify
 * netevent listeners.  Elided lines include the RTF_CACHE expiry of
 * the old route — confirm against full source. */
1637 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1638 const struct in6_addr *saddr,
1639 struct neighbour *neigh, u8 *lladdr, int on_link)
1641 struct rt6_info *rt, *nrt = NULL;
1642 struct netevent_redirect netevent;
1643 struct net *net = dev_net(neigh->dev);
1645 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
/* Null entry means the sender is not our next hop for dest. */
1647 if (rt == net->ipv6.ip6_null_entry) {
1648 if (net_ratelimit())
1649 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1650 "for redirect target\n");
1655 * We have finally decided to accept it.
/* Routers sending redirects assert their own reachability; mark the
 * neighbour STALE and allow the override flags below. */
1658 neigh_update(neigh, lladdr, NUD_STALE,
1659 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1660 NEIGH_UPDATE_F_OVERRIDE|
1661 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1662 NEIGH_UPDATE_F_ISROUTER))
1666 * Redirect received -> path was valid.
1667 * Look, redirects are sent only in response to data packets,
1668 * so that this nexthop apparently is reachable. --ANK
1670 dst_confirm(&rt->dst);
1672 /* Duplicate redirect: silently ignore. */
1673 if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1676 nrt = ip6_rt_copy(rt, dest);
1680 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
/* on_link redirects point directly at the target: no gateway flag. */
1682 nrt->rt6i_flags &= ~RTF_GATEWAY;
1684 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1685 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1687 if (ip6_ins_rt(nrt))
1690 netevent.old = &rt->dst;
1691 netevent.new = &nrt->dst;
1692 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1694 if (rt->rt6i_flags & RTF_CACHE) {
1700 dst_release(&rt->dst);
 * Handle ICMP "packet too big" messages
 * i.e. Path MTU discovery
/* Apply a reported PMTU to the route toward @daddr looked up on
 * @ifindex.  Cached host routes are updated in place; otherwise a COW
 * or clone copy is created carrying the reduced MTU with an
 * ip6_rt_mtu_expires lifetime (RFC 1981).  Elided lines include the
 * bail-outs after the expiry/alloc checks — not compilable as shown. */
1708 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1709 struct net *net, u32 pmtu, int ifindex)
1711 struct rt6_info *rt, *nrt;
1714 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1718 if (rt6_check_expired(rt)) {
/* Reported MTU not smaller than the current one: nothing to shrink. */
1723 if (pmtu >= dst_mtu(&rt->dst))
1726 if (pmtu < IPV6_MIN_MTU) {
1728 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1729 * MTU (1280) and a fragment header should always be included
1730 * after a node receiving Too Big message reporting PMTU is
1731 * less than the IPv6 Minimum Link MTU.
1733 pmtu = IPV6_MIN_MTU;
1737 /* New mtu received -> path was valid.
1738 They are sent only in response to data packets,
1739 so that this nexthop apparently is reachable. --ANK
1741 dst_confirm(&rt->dst);
1743 /* Host route. If it is static, it would be better
1744 not to override it, but add new one, so that
1745 when cache entry will expire old pmtu
1746 would return automatically.
1748 if (rt->rt6i_flags & RTF_CACHE) {
1749 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
/* Sub-minimum MTU path: ALLFRAG forces a fragment header on every
 * packet, per the RFC 2460 note above (branch condition elided). */
1751 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1752 features |= RTAX_FEATURE_ALLFRAG;
1753 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1755 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1756 rt->rt6i_flags |= RTF_MODIFIED;
1761 Two cases are possible:
1762 1. It is connected route. Action: COW
1763 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1765 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1766 nrt = rt6_alloc_cow(rt, daddr, saddr);
1768 nrt = rt6_alloc_clone(rt, daddr);
1771 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1773 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1774 features |= RTAX_FEATURE_ALLFRAG;
1775 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1778 /* According to RFC 1981, detecting PMTU increase shouldn't be
1779 * happened within 5 mins, the recommended timer is 10 mins.
1780 * Here this route expiration time is set to ip6_rt_mtu_expires
1781 * which is 10 mins. After 10 mins the decreased pmtu is expired
1782 * and detecting PMTU increase will be automatically happened.
1784 rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1785 nrt->rt6i_flags |= RTF_DYNAMIC;
/* Release the reference taken by rt6_lookup() on all exit paths. */
1789 dst_release(&rt->dst);
/* Entry point for a received Packet Too Big: apply the PMTU both
 * unscoped (any outgoing interface) and scoped to the receiving
 * device, for the SO_BINDTODEVICE case described below. */
1792 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1793 struct net_device *dev, u32 pmtu)
1795 struct net *net = dev_net(dev);
1798 * RFC 1981 states that a node "MUST reduce the size of the packets it
1799 * is sending along the path" that caused the Packet Too Big message.
1800 * Since it's not possible in the general case to determine which
1801 * interface was used to send the original packet, we update the MTU
1802 * on the interface that will be used to send future packets. We also
1803 * update the MTU on the interface that received the Packet Too Big in
1804 * case the original packet was forced out that interface with
1805 * SO_BINDTODEVICE or similar. This is the next best thing to the
1806 * correct behaviour, which would be to update the MTU on all
1809 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1810 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1814 * Misc support functions
/* Allocate a new rt6_info cloned from @ort with its destination
 * rewritten to the /128 host address @dest.  Copies dst ops fields,
 * metrics, idev (with a hold), gateway and flags; expiry is inherited
 * only for RA-learned (RTF_DEFAULT|RTF_ADDRCONF) routes, otherwise
 * cleared.  Elided lines include the NULL-alloc guard and return. */
1817 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1818 const struct in6_addr *dest)
1820 struct net *net = dev_net(ort->dst.dev);
1821 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1825 rt->dst.input = ort->dst.input;
1826 rt->dst.output = ort->dst.output;
1827 rt->dst.flags |= DST_HOST;
1829 rt->rt6i_dst.addr = *dest;
1830 rt->rt6i_dst.plen = 128;
1831 dst_copy_metrics(&rt->dst, &ort->dst);
1832 rt->dst.error = ort->dst.error;
1833 rt->rt6i_idev = ort->rt6i_idev;
1835 in6_dev_hold(rt->rt6i_idev);
1836 rt->dst.lastuse = jiffies;
1838 rt->rt6i_gateway = ort->rt6i_gateway;
1839 rt->rt6i_flags = ort->rt6i_flags;
/* Tie the copy's lifetime to its RA-learned parent via rt6_set_from. */
1840 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1841 (RTF_DEFAULT | RTF_ADDRCONF))
1842 rt6_set_from(rt, ort);
1844 rt6_clean_expires(rt);
1845 rt->rt6i_metric = 0;
1847 #ifdef CONFIG_IPV6_SUBTREES
1848 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1850 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1851 rt->rt6i_table = ort->rt6i_table;
1856 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RA Route Information option route in RT6_TABLE_INFO
 * matching prefix, gateway and interface.  Elided lines include the
 * dst hold on match and unlock/return — confirm against full source. */
1857 static struct rt6_info *rt6_get_route_info(struct net *net,
1858 const struct in6_addr *prefix, int prefixlen,
1859 const struct in6_addr *gwaddr, int ifindex)
1861 struct fib6_node *fn;
1862 struct rt6_info *rt = NULL;
1863 struct fib6_table *table;
1865 table = fib6_get_table(net, RT6_TABLE_INFO)
1869 write_lock_bh(&table->tb6_lock);
1870 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1874 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1875 if (rt->dst.dev->ifindex != ifindex)
1877 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1879 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1885 write_unlock_bh(&table->tb6_lock);
/* Install a route learned from an RA Route Information option into
 * RT6_TABLE_INFO, then re-look it up (ip6_route_add does not return
 * the inserted route).  A zero-length prefix is treated as default. */
1889 static struct rt6_info *rt6_add_route_info(struct net *net,
1890 const struct in6_addr *prefix, int prefixlen,
1891 const struct in6_addr *gwaddr, int ifindex,
1894 struct fib6_config cfg = {
1895 .fc_table = RT6_TABLE_INFO,
1896 .fc_metric = IP6_RT_PRIO_USER,
1897 .fc_ifindex = ifindex,
1898 .fc_dst_len = prefixlen,
1899 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1900 RTF_UP | RTF_PREF(pref),
1902 .fc_nlinfo.nlh = NULL,
1903 .fc_nlinfo.nl_net = net,
1906 cfg.fc_dst = *prefix;
1907 cfg.fc_gateway = *gwaddr;
1909 /* We should treat it as a default route if prefix length is 0. */
1911 cfg.fc_flags |= RTF_DEFAULT;
1913 ip6_route_add(&cfg);
1915 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/* Find the RA-learned default route via gateway @addr on @dev in
 * RT6_TABLE_DFLT.  Elided lines include the dst hold and return. */
1919 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1921 struct rt6_info *rt;
1922 struct fib6_table *table;
1924 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1928 write_lock_bh(&table->tb6_lock);
/* Default routes all hang off the table root's leaf chain. */
1929 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1930 if (dev == rt->dst.dev &&
1931 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1932 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1937 write_unlock_bh(&table->tb6_lock);
/* Add an RA-learned default router (RTF_EXPIRES: lifetime-bounded)
 * and return it via a fresh lookup, mirroring rt6_add_route_info(). */
1941 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1942 struct net_device *dev,
1945 struct fib6_config cfg = {
1946 .fc_table = RT6_TABLE_DFLT,
1947 .fc_metric = IP6_RT_PRIO_USER,
1948 .fc_ifindex = dev->ifindex,
1949 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1950 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1952 .fc_nlinfo.nlh = NULL,
1953 .fc_nlinfo.nl_net = dev_net(dev),
1956 cfg.fc_gateway = *gwaddr;
1958 ip6_route_add(&cfg);
1960 return rt6_get_dflt_router(gwaddr, dev);
/* Delete every RA-learned default route in the netns, except on
 * interfaces with accept_ra == 2 (always accept).  Elided lines
 * presumably hold the dst, drop the lock, delete, and restart the
 * scan — confirm against full source. */
1963 void rt6_purge_dflt_routers(struct net *net)
1965 struct rt6_info *rt;
1966 struct fib6_table *table;
1968 /* NOTE: Keep consistent with rt6_get_dflt_router */
1969 table = fib6_get_table(net, RT6_TABLE_DFLT);
1974 read_lock_bh(&table->tb6_lock);
1975 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1976 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1977 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1979 read_unlock_bh(&table->tb6_lock);
1984 read_unlock_bh(&table->tb6_lock);
/* Translate a legacy ioctl in6_rtmsg into the internal fib6_config
 * used by ip6_route_add/del.  All routes via this path land in the
 * main table. */
1987 static void rtmsg_to_fib6_config(struct net *net,
1988 struct in6_rtmsg *rtmsg,
1989 struct fib6_config *cfg)
1991 memset(cfg, 0, sizeof(*cfg));
1993 cfg->fc_table = RT6_TABLE_MAIN;
1994 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1995 cfg->fc_metric = rtmsg->rtmsg_metric;
1996 cfg->fc_expires = rtmsg->rtmsg_info;
1997 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1998 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1999 cfg->fc_flags = rtmsg->rtmsg_flags;
2001 cfg->fc_nlinfo.nl_net = net;
2003 cfg->fc_dst = rtmsg->rtmsg_dst;
2004 cfg->fc_src = rtmsg->rtmsg_src;
2005 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* SIOCADDRT/SIOCDELRT ioctl handler: CAP_NET_ADMIN required, route
 * spec copied from userspace and converted via rtmsg_to_fib6_config().
 * Elided lines include rtnl locking and the inner switch — confirm
 * against full source. */
2008 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2010 struct fib6_config cfg;
2011 struct in6_rtmsg rtmsg;
2015 case SIOCADDRT: /* Add a route */
2016 case SIOCDELRT: /* Delete a route */
2017 if (!capable(CAP_NET_ADMIN))
2019 err = copy_from_user(&rtmsg, arg,
2020 sizeof(struct in6_rtmsg));
2024 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2029 err = ip6_route_add(&cfg);
2032 err = ip6_route_del(&cfg);
 * Drop the packet on the floor
/* Shared drop path for the null/prohibit dst entries: bump the right
 * SNMP counter, send an ICMPv6 Destination Unreachable with @code,
 * and free the skb (kfree elided in this extract). */
2049 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2052 struct dst_entry *dst = skb_dst(skb);
2053 switch (ipstats_mib_noroutes) {
2054 case IPSTATS_MIB_INNOROUTES:
/* Unspecified-destination input is an address error, not no-route. */
2055 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2056 if (type == IPV6_ADDR_ANY) {
2057 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2058 IPSTATS_MIB_INADDRERRORS);
2062 case IPSTATS_MIB_OUTNOROUTES:
2063 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2064 ipstats_mib_noroutes);
2067 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input handler for the null entry. */
2072 static int ip6_pkt_discard(struct sk_buff *skb)
2074 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for the null entry; sets skb->dev first so the
 * ICMP error is emitted on the right interface. */
2077 static int ip6_pkt_discard_out(struct sk_buff *skb)
2079 skb->dev = skb_dst(skb)->dev;
2080 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2083 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Prohibit-entry handlers: same drop path, administratively-prohibited
 * ICMP code. */
2085 static int ip6_pkt_prohibit(struct sk_buff *skb)
2087 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2090 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2092 skb->dev = skb_dst(skb)->dev;
2093 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
 * Allocate a dst for local (unicast / anycast) address.
/* Build a host route on the loopback device for a local address,
 * flagged RTF_LOCAL or RTF_ANYCAST (anycast condition elided), bind a
 * neighbour entry, and place it in RT6_TABLE_LOCAL.  Returns the
 * route with one reference, or ERR_PTR on allocation/neighbour
 * failure. */
2102 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2103 const struct in6_addr *addr,
2106 struct net *net = dev_net(idev->dev);
2107 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2108 net->loopback_dev, 0);
2112 if (net_ratelimit())
2113 pr_warning("IPv6: Maximum number of routes reached,"
2114 " consider increasing route/max_size.\n")
2115 return ERR_PTR(-ENOMEM);
2120 rt->dst.flags |= DST_HOST;
2121 rt->dst.input = ip6_input;
2122 rt->dst.output = ip6_output;
2123 rt->rt6i_idev = idev;
2124 rt->dst.obsolete = -1;
2126 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2128 rt->rt6i_flags |= RTF_ANYCAST;
2130 rt->rt6i_flags |= RTF_LOCAL;
2131 err = rt6_bind_neighbour(rt, rt->dst.dev);
/* Elided error path presumably releases the dst before returning. */
2134 return ERR_PTR(err);
2137 rt->rt6i_dst.addr = *addr;
2138 rt->rt6i_dst.plen = 128;
2139 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2141 atomic_set(&rt->dst.__refcnt, 1);
/* Choose a source address for traffic using @rt: prefer the route's
 * configured prefsrc when set, otherwise fall back to the standard
 * source-selection on the route's interface. */
2146 int ip6_route_get_saddr(struct net *net,
2147 struct rt6_info *rt,
2148 const struct in6_addr *daddr,
2150 struct in6_addr *saddr)
2152 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2154 if (rt->rt6i_prefsrc.plen)
2155 *saddr = rt->rt6i_prefsrc.addr;
2157 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2158 daddr, prefs, saddr);
2162 /* remove deleted ip from prefsrc entries */
2163 struct arg_dev_net_ip {
2164 struct net_device *dev;
2166 struct in6_addr *addr;
/* fib6_clean_all callback: clear the prefsrc of every route whose
 * preferred source equals the deleted address (optionally limited to
 * one device).  Returning 0 (elided) keeps the route itself. */
2169 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2171 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2172 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2173 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2175 if (((void *)rt->dst.dev == dev || !dev) &&
2176 rt != net->ipv6.ip6_null_entry &&
2177 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2178 /* remove prefsrc entry */
2179 rt->rt6i_prefsrc.plen = 0;
/* Walk all tables when an address is removed from an interface. */
2184 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2186 struct net *net = dev_net(ifp->idev->dev);
2187 struct arg_dev_net_ip adni = {
2188 .dev = ifp->idev->dev,
2192 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2195 struct arg_dev_net {
2196 struct net_device *dev;
/* fib6_clean_all callback: select for deletion every route on @dev
 * (or every route when dev is NULL), sparing the null entry.  The
 * non-zero "delete" return is elided in this extract. */
2200 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2202 const struct arg_dev_net *adn = arg;
2203 const struct net_device *dev = adn->dev;
2205 if ((rt->dst.dev == dev || !dev) &&
2206 rt != adn->net->ipv6.ip6_null_entry)
/* Device-down: purge its routes from both the FIB and the ICMP
 * rate-limit dst cache. */
2212 void rt6_ifdown(struct net *net, struct net_device *dev)
2214 struct arg_dev_net adn = {
2219 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2220 icmp6_clean_all(fib6_ifdown, &adn);
2223 struct rt6_mtu_change_arg
2225 struct net_device *dev;
/* fib6_clean_all callback: propagate an administrative device MTU
 * change into the cached RTAX_MTU of routes on that device, unless
 * the metric is locked.  See the in-body comments for the
 * increase/decrease rules. */
2229 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2231 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2232 struct inet6_dev *idev;
2234 /* In IPv6 pmtu discovery is not optional,
2235 so that RTAX_MTU lock cannot disable it.
2236 We still use this lock to block changes
2237 caused by addrconf/ndisc.
2240 idev = __in6_dev_get(arg->dev);
2244 /* For administrative MTU increase, there is no way to discover
2245 IPv6 PMTU increase, so PMTU increase should be updated here.
2246 Since RFC 1981 doesn't include administrative MTU increase
2247 update PMTU increase is a MUST. (i.e. jumbo frame)
2250 If new MTU is less than route PMTU, this new MTU will be the
2251 lowest MTU in the path, update the route PMTU to reflect PMTU
2252 decreases; if new MTU is greater than route PMTU, and the
2253 old MTU is the lowest MTU in the path, update the route PMTU
2254 to reflect the increase. In this case if the other nodes' MTU
2255 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2258 if (rt->dst.dev == arg->dev &&
2259 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2260 (dst_mtu(&rt->dst) >= arg->mtu ||
2261 (dst_mtu(&rt->dst) < arg->mtu &&
2262 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2263 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* NETDEV_CHANGEMTU entry point: sweep all tables in the device's
 * netns. */
2268 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2270 struct rt6_mtu_change_arg arg = {
2275 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for RTM_*ROUTE requests. */
2278 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2279 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2280 [RTA_OIF] = { .type = NLA_U32 },
2281 [RTA_IIF] = { .type = NLA_U32 },
2282 [RTA_PRIORITY] = { .type = NLA_U32 },
2283 [RTA_METRICS] = { .type = NLA_NESTED },
/* Parse an RTM_NEWROUTE/DELROUTE message into a fib6_config.  Error
 * returns and several bail-outs are elided in this extract. */
2286 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2287 struct fib6_config *cfg)
2290 struct nlattr *tb[RTA_MAX+1];
2293 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2298 rtm = nlmsg_data(nlh);
2299 memset(cfg, 0, sizeof(*cfg));
2301 cfg->fc_table = rtm->rtm_table;
2302 cfg->fc_dst_len = rtm->rtm_dst_len;
2303 cfg->fc_src_len = rtm->rtm_src_len;
2304 cfg->fc_flags = RTF_UP;
2305 cfg->fc_protocol = rtm->rtm_protocol;
2307 if (rtm->rtm_type == RTN_UNREACHABLE)
2308 cfg->fc_flags |= RTF_REJECT;
2310 if (rtm->rtm_type == RTN_LOCAL)
2311 cfg->fc_flags |= RTF_LOCAL;
2313 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2314 cfg->fc_nlinfo.nlh = nlh;
2315 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2317 if (tb[RTA_GATEWAY]) {
2318 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2319 cfg->fc_flags |= RTF_GATEWAY;
/* Prefix attributes may be shorter than 16 bytes; copy only the
 * bytes the prefix length covers. */
2323 int plen = (rtm->rtm_dst_len + 7) >> 3;
2325 if (nla_len(tb[RTA_DST]) < plen)
2328 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2332 int plen = (rtm->rtm_src_len + 7) >> 3;
2334 if (nla_len(tb[RTA_SRC]) < plen)
2337 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2340 if (tb[RTA_PREFSRC])
2341 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2344 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2346 if (tb[RTA_PRIORITY])
2347 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2349 if (tb[RTA_METRICS]) {
2350 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2351 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
/* Explicit RTA_TABLE overrides the header's rtm_table. */
2355 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE handler: parse into fib6_config, then delete. */
2362 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2364 struct fib6_config cfg;
2367 err = rtm_to_fib6_config(skb, nlh, &cfg);
2371 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse into fib6_config, then add. */
2374 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2376 struct fib6_config cfg;
2379 err = rtm_to_fib6_config(skb, nlh, &cfg);
2383 return ip6_route_add(&cfg);
/* Worst-case payload size of one RTM_NEWROUTE notification, used to
 * size the skb in inet6_rt_notify().  Must stay in sync with the
 * attributes emitted by rt6_fill_node(). */
2386 static inline size_t rt6_nlmsg_size(void)
2388 return NLMSG_ALIGN(sizeof(struct rtmsg))
2389 + nla_total_size(16) /* RTA_SRC */
2390 + nla_total_size(16) /* RTA_DST */
2391 + nla_total_size(16) /* RTA_GATEWAY */
2392 + nla_total_size(16) /* RTA_PREFSRC */
2393 + nla_total_size(4) /* RTA_TABLE */
2394 + nla_total_size(4) /* RTA_IIF */
2395 + nla_total_size(4) /* RTA_OIF */
2396 + nla_total_size(4) /* RTA_PRIORITY */
2397 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2398 + nla_total_size(sizeof(struct rta_cacheinfo));
/* Serialize @rt into a netlink RTM message on @skb.  Used by dumps,
 * RTM_GETROUTE replies and notifications.  @prefix restricts output
 * to prefix routes; dst/src, when non-NULL, report the queried host
 * addresses instead of the route's own prefixes.  Returns nlmsg_end()
 * length or a negative error (nla_put_failure path mostly elided). */
2401 static int rt6_fill_node(struct net *net,
2402 struct sk_buff *skb, struct rt6_info *rt,
2403 struct in6_addr *dst, struct in6_addr *src,
2404 int iif, int type, u32 pid, u32 seq,
2405 int prefix, int nowait, unsigned int flags)
2407 const struct inet_peer *peer;
2409 struct nlmsghdr *nlh;
2412 struct neighbour *n;
2415 if (prefix) { /* user wants prefix routes only */
2416 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2417 /* success since this is not a prefix route */
2422 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2426 rtm = nlmsg_data(nlh);
2427 rtm->rtm_family = AF_INET6;
2428 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2429 rtm->rtm_src_len = rt->rt6i_src.plen;
2432 table = rt->rt6i_table->tb6_id;
2434 table = RT6_TABLE_UNSPEC;
2435 rtm->rtm_table = table;
2436 NLA_PUT_U32(skb, RTA_TABLE, table);
/* Map route flags onto the closest RTN_* route type. */
2437 if (rt->rt6i_flags & RTF_REJECT)
2438 rtm->rtm_type = RTN_UNREACHABLE;
2439 else if (rt->rt6i_flags & RTF_LOCAL)
2440 rtm->rtm_type = RTN_LOCAL;
2441 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2442 rtm->rtm_type = RTN_LOCAL;
2444 rtm->rtm_type = RTN_UNICAST;
2446 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2447 rtm->rtm_protocol = rt->rt6i_protocol;
2448 if (rt->rt6i_flags & RTF_DYNAMIC)
2449 rtm->rtm_protocol = RTPROT_REDIRECT;
2450 else if (rt->rt6i_flags & RTF_ADDRCONF)
2451 rtm->rtm_protocol = RTPROT_KERNEL;
2452 else if (rt->rt6i_flags & RTF_DEFAULT)
2453 rtm->rtm_protocol = RTPROT_RA;
2455 if (rt->rt6i_flags & RTF_CACHE)
2456 rtm->rtm_flags |= RTM_F_CLONED;
/* Queried-address reply: report the exact host, plen 128. */
2459 NLA_PUT(skb, RTA_DST, 16, dst);
2460 rtm->rtm_dst_len = 128;
2461 } else if (rtm->rtm_dst_len)
2462 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2463 #ifdef CONFIG_IPV6_SUBTREES
2465 NLA_PUT(skb, RTA_SRC, 16, src);
2466 rtm->rtm_src_len = 128;
2467 } else if (rtm->rtm_src_len)
2468 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2471 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations delegate to the multicast routing table. */
2472 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2473 int err = ip6mr_get_route(net, skb, rtm, nowait);
2478 goto nla_put_failure;
2480 if (err == -EMSGSIZE)
2481 goto nla_put_failure;
2486 NLA_PUT_U32(skb, RTA_IIF, iif);
2488 struct in6_addr saddr_buf;
2489 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2490 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2493 if (rt->rt6i_prefsrc.plen) {
2494 struct in6_addr saddr_buf;
2495 saddr_buf = rt->rt6i_prefsrc.addr;
2496 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2499 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2500 goto nla_put_failure;
2503 n = dst_get_neighbour_noref(&rt->dst);
2505 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2507 goto nla_put_failure;
2513 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2515 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* Clamp the remaining-expiry delta into an int for cacheinfo. */
2517 if (!(rt->rt6i_flags & RTF_EXPIRES))
2519 else if (rt->dst.expires - jiffies < INT_MAX)
2520 expires = rt->dst.expires - jiffies;
2524 peer = rt->rt6i_peer;
2526 if (peer && peer->tcp_ts_stamp) {
2528 tsage = get_seconds() - peer->tcp_ts_stamp;
2531 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2532 expires, rt->dst.error) < 0)
2533 goto nla_put_failure;
2535 return nlmsg_end(skb, nlh);
2538 nlmsg_cancel(skb, nlh);
/* Per-route callback for RTM_GETROUTE dumps: honors the RTM_F_PREFIX
 * filter from the request header, then delegates to rt6_fill_node()
 * with NLM_F_MULTI set. */
2542 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2544 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2547 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2548 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2549 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2553 return rt6_fill_node(arg->net,
2554 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2555 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2556 prefix, 0, NLM_F_MULTI);
/* RTM_GETROUTE handler: build a flow from the request attributes,
 * resolve it (input lookup when RTA_IIF present, output lookup
 * otherwise), render the route with rt6_fill_node() and unicast the
 * reply.  Several error/cleanup branches are elided in this extract. */
2559 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2561 struct net *net = sock_net(in_skb->sk);
2562 struct nlattr *tb[RTA_MAX+1];
2563 struct rt6_info *rt;
2564 struct sk_buff *skb;
2567 int err, iif = 0, oif = 0;
2569 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2574 memset(&fl6, 0, sizeof(fl6));
2577 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2580 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2584 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2587 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2591 iif = nla_get_u32(tb[RTA_IIF]);
2594 oif = nla_get_u32(tb[RTA_OIF]);
/* iif path: validate the device exists, then do an input-style
 * lookup as if the packet had arrived on it. */
2597 struct net_device *dev;
2600 dev = __dev_get_by_index(net, iif);
2606 fl6.flowi6_iif = iif;
2608 if (!ipv6_addr_any(&fl6.saddr))
2609 flags |= RT6_LOOKUP_F_HAS_SADDR;
2611 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2614 fl6.flowi6_oif = oif;
2616 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2619 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2625 /* Reserve room for dummy headers, this skb can pass
2626 through good chunk of routing engine.
2628 skb_reset_mac_header(skb);
2629 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* Transfer the route reference to the reply skb's dst. */
2631 skb_dst_set(skb, &rt->dst);
2633 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2634 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2635 nlh->nlmsg_seq, 0, 0, 0);
2641 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/* Multicast an RTM_NEWROUTE/DELROUTE notification for @rt to the
 * RTNLGRP_IPV6_ROUTE group; on failure report the error to group
 * listeners via rtnl_set_sk_err(). */
2646 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2648 struct sk_buff *skb;
2649 struct net *net = info->nl_net;
2654 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2656 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2660 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2661 event, info->pid, seq, 0, 0, 0);
2663 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2664 WARN_ON(err == -EMSGSIZE);
2668 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2669 info->nlh, gfp_any());
2673 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* netdev notifier: when the per-netns loopback registers, attach it
 * (and its inet6_dev) to the special null/prohibit/blackhole route
 * entries so they always have a valid device. */
2676 static int ip6_route_dev_notify(struct notifier_block *this,
2677 unsigned long event, void *data)
2679 struct net_device *dev = (struct net_device *)data;
2680 struct net *net = dev_net(dev);
2682 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2683 net->ipv6.ip6_null_entry->dst.dev = dev;
2684 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2685 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2686 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2687 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2688 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2689 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2700 #ifdef CONFIG_PROC_FS
/* Format one route as a /proc/net/ipv6_route line: dst/plen, src/plen
 * (zeros without subtrees), neighbour address, metric, refcnt, use
 * count, flags, device name. */
2711 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2713 struct seq_file *m = p_arg;
2714 struct neighbour *n;
2716 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2718 #ifdef CONFIG_IPV6_SUBTREES
2719 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2721 seq_puts(m, "00000000000000000000000000000000 00 ");
2724 n = dst_get_neighbour_noref(&rt->dst);
2726 seq_printf(m, "%pi6", n->primary_key);
2728 seq_puts(m, "00000000000000000000000000000000");
2731 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2732 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2733 rt->dst.__use, rt->rt6i_flags,
2734 rt->dst.dev ? rt->dst.dev->name : "");
/* seq_file show: read-only walk over all tables in the netns. */
2738 static int ipv6_route_show(struct seq_file *m, void *v)
2740 struct net *net = (struct net *)m->private;
2741 fib6_clean_all_ro(net, rt6_info_route, 0, m);
2745 static int ipv6_route_open(struct inode *inode, struct file *file)
2747 return single_open_net(inode, file, ipv6_route_show);
2750 static const struct file_operations ipv6_route_proc_fops = {
2751 .owner = THIS_MODULE,
2752 .open = ipv6_route_open,
2754 .llseek = seq_lseek,
2755 .release = single_release_net,
/* /proc/net/rt6_stats: one line of hex FIB counters plus the current
 * dst-entry count. */
2758 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2760 struct net *net = (struct net *)seq->private;
2761 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2762 net->ipv6.rt6_stats->fib_nodes,
2763 net->ipv6.rt6_stats->fib_route_nodes,
2764 net->ipv6.rt6_stats->fib_rt_alloc,
2765 net->ipv6.rt6_stats->fib_rt_entries,
2766 net->ipv6.rt6_stats->fib_rt_cache,
2767 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2768 net->ipv6.rt6_stats->fib_discarded_routes);
2773 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2775 return single_open_net(inode, file, rt6_stats_seq_show);
2778 static const struct file_operations rt6_stats_seq_fops = {
2779 .owner = THIS_MODULE,
2780 .open = rt6_stats_seq_open,
2782 .llseek = seq_lseek,
2783 .release = single_release_net,
2785 #endif /* CONFIG_PROC_FS */
2787 #ifdef CONFIG_SYSCTL
/* Write handler for net.ipv6.route.flush: read the delay value, then
 * trigger a FIB garbage-collection run; non-positive delay flushes
 * immediately (~0UL).  Write-only guard is elided in this extract. */
2790 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2791 void __user *buffer, size_t *lenp, loff_t *ppos)
2798 net = (struct net *)ctl->extra1;
2799 delay = net->ipv6.sysctl.flush_delay;
2800 proc_dointvec(ctl, write, buffer, lenp, ppos);
2801 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/* Template for the per-netns net.ipv6.route sysctl directory; entry
 * order must match the index-based rebinding in
 * ipv6_route_sysctl_init(). */
2805 ctl_table ipv6_route_table_template[] = {
2807 .procname = "flush",
2808 .data = &init_net.ipv6.sysctl.flush_delay,
2809 .maxlen = sizeof(int),
2811 .proc_handler = ipv6_sysctl_rtcache_flush
2814 .procname = "gc_thresh",
2815 .data = &ip6_dst_ops_template.gc_thresh,
2816 .maxlen = sizeof(int),
2818 .proc_handler = proc_dointvec,
2821 .procname = "max_size",
2822 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2823 .maxlen = sizeof(int),
2825 .proc_handler = proc_dointvec,
2828 .procname = "gc_min_interval",
2829 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2830 .maxlen = sizeof(int),
2832 .proc_handler = proc_dointvec_jiffies,
2835 .procname = "gc_timeout",
2836 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2837 .maxlen = sizeof(int),
2839 .proc_handler = proc_dointvec_jiffies,
2842 .procname = "gc_interval",
2843 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2844 .maxlen = sizeof(int),
2846 .proc_handler = proc_dointvec_jiffies,
2849 .procname = "gc_elasticity",
2850 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2851 .maxlen = sizeof(int),
2853 .proc_handler = proc_dointvec,
2856 .procname = "mtu_expires",
2857 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2858 .maxlen = sizeof(int),
2860 .proc_handler = proc_dointvec_jiffies,
2863 .procname = "min_adv_mss",
2864 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2865 .maxlen = sizeof(int),
2867 .proc_handler = proc_dointvec,
/* Same variable as gc_min_interval, exposed in milliseconds. */
2870 .procname = "gc_min_interval_ms",
2871 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2872 .maxlen = sizeof(int),
2874 .proc_handler = proc_dointvec_ms_jiffies,
/* Duplicate the sysctl template for a new netns and rebind each
 * entry's data pointer (indices must match the template order above)
 * to that netns's variables.  NULL-check of kmemdup elided. */
2879 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2881 struct ctl_table *table;
2883 table = kmemdup(ipv6_route_table_template,
2884 sizeof(ipv6_route_table_template),
2888 table[0].data = &net->ipv6.sysctl.flush_delay;
2889 table[0].extra1 = net;
2890 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2891 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2892 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2893 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2894 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2895 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2896 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2897 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2898 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/* Per-netns init: clone the dst_ops template, allocate the special
 * null (and, with multiple tables, prohibit/blackhole) route entries
 * from their templates, and seed the routing sysctl defaults.
 * Unwinds allocations via the labels at the bottom on failure. */
2905 static int __net_init ip6_route_net_init(struct net *net)
2909 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2910 sizeof(net->ipv6.ip6_dst_ops));
2912 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2913 goto out_ip6_dst_ops;
2915 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2916 sizeof(*net->ipv6.ip6_null_entry),
2918 if (!net->ipv6.ip6_null_entry)
2919 goto out_ip6_dst_entries;
/* Templates carry pointers into init_net; repoint them here. */
2920 net->ipv6.ip6_null_entry->dst.path =
2921 (struct dst_entry *)net->ipv6.ip6_null_entry;
2922 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2923 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2924 ip6_template_metrics, true);
2926 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2927 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2928 sizeof(*net->ipv6.ip6_prohibit_entry),
2930 if (!net->ipv6.ip6_prohibit_entry)
2931 goto out_ip6_null_entry;
2932 net->ipv6.ip6_prohibit_entry->dst.path =
2933 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2934 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2935 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2936 ip6_template_metrics, true);
2938 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2939 sizeof(*net->ipv6.ip6_blk_hole_entry),
2941 if (!net->ipv6.ip6_blk_hole_entry)
2942 goto out_ip6_prohibit_entry;
2943 net->ipv6.ip6_blk_hole_entry->dst.path =
2944 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2945 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2946 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2947 ip6_template_metrics, true);
/* Tunable defaults; overridable via net.ipv6.route.* sysctls. */
2950 net->ipv6.sysctl.flush_delay = 0;
2951 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2952 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2953 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2954 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2955 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2956 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2957 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2959 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2965 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2966 out_ip6_prohibit_entry:
2967 kfree(net->ipv6.ip6_prohibit_entry);
2969 kfree(net->ipv6.ip6_null_entry);
2971 out_ip6_dst_entries:
2972 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2977 static void __net_exit ip6_route_net_exit(struct net *net)
2979 kfree(net->ipv6.ip6_null_entry);
2980 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2981 kfree(net->ipv6.ip6_prohibit_entry);
2982 kfree(net->ipv6.ip6_blk_hole_entry);
2984 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2987 static int __net_init ip6_route_net_init_late(struct net *net)
2989 #ifdef CONFIG_PROC_FS
2990 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2991 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2996 static void __net_exit ip6_route_net_exit_late(struct net *net)
2998 #ifdef CONFIG_PROC_FS
2999 proc_net_remove(net, "ipv6_route");
3000 proc_net_remove(net, "rt6_stats");
3004 static struct pernet_operations ip6_route_net_ops = {
3005 .init = ip6_route_net_init,
3006 .exit = ip6_route_net_exit,
3009 static struct pernet_operations ip6_route_net_late_ops = {
3010 .init = ip6_route_net_init_late,
3011 .exit = ip6_route_net_exit_late,
3014 static struct notifier_block ip6_route_dev_notifier = {
3015 .notifier_call = ip6_route_dev_notify,
3019 int __init ip6_route_init(void)
3024 ip6_dst_ops_template.kmem_cachep =
3025 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3026 SLAB_HWCACHE_ALIGN, NULL);
3027 if (!ip6_dst_ops_template.kmem_cachep)
3030 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3032 goto out_kmem_cache;
3034 ret = register_pernet_subsys(&ip6_route_net_ops);
3036 goto out_dst_entries;
3038 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3040 /* Registering of the loopback is done before this portion of code,
3041 * the loopback reference in rt6_info will not be taken, do it
3042 * manually for init_net */
3043 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3044 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3045 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3046 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3047 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3048 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3049 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3053 goto out_register_subsys;
3059 ret = fib6_rules_init();
3063 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3065 goto fib6_rules_init;
3068 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3069 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3070 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3071 goto out_register_late_subsys;
3073 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3075 goto out_register_late_subsys;
3080 out_register_late_subsys:
3081 unregister_pernet_subsys(&ip6_route_net_late_ops);
3083 fib6_rules_cleanup();
3088 out_register_subsys:
3089 unregister_pernet_subsys(&ip6_route_net_ops);
3091 dst_entries_destroy(&ip6_dst_blackhole_ops);
3093 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3097 void ip6_route_cleanup(void)
3099 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3100 unregister_pernet_subsys(&ip6_route_net_late_ops);
3101 fib6_rules_cleanup();
3104 unregister_pernet_subsys(&ip6_route_net_ops);
3105 dst_entries_destroy(&ip6_dst_blackhole_ops);
3106 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);