2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
/*
 * Forward declarations of file-local helpers.  Several of them are
 * installed as dst_ops callbacks in ip6_dst_ops_template.
 */
65 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
66 const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void ip6_dst_destroy(struct dst_entry *);
72 static void ip6_dst_ifdown(struct dst_entry *,
73 struct net_device *dev, int how);
74 static int ip6_dst_gc(struct dst_ops *ops);
76 static int ip6_pkt_discard(struct sk_buff *skb);
77 static int ip6_pkt_discard_out(struct sk_buff *skb);
78 static void ip6_link_failure(struct sk_buff *skb);
79 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/*
 * Prototypes for the route-information helpers called by
 * rt6_route_rcv(); only declared when CONFIG_IPV6_ROUTE_INFO is set.
 */
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83 const struct in6_addr *prefix, int prefixlen,
84 const struct in6_addr *gwaddr, int ifindex,
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87 const struct in6_addr *prefix, int prefixlen,
88 const struct in6_addr *gwaddr, int ifindex);
/*
 * cow_metrics callback (installed in ip6_dst_ops_template): called to
 * obtain a metrics array for a dst whose current metrics word is 'old'.
 *
 * Routes without DST_HOST are tested for first.  On the remaining path
 * an inet_peer is bound to the route (created if necessary), the old
 * metrics are copied into 'p' when inet_metrics_new() reports the peer's
 * metrics as new, and the new pointer is published with cmpxchg() so a
 * concurrent update shows up in 'prev'.
 * NOTE(review): 'p' is presumably the peer's metrics array; its
 * assignment is not visible here - confirm.
 */
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
93 struct rt6_info *rt = (struct rt6_info *) dst;
94 struct inet_peer *peer;
97 if (!(rt->dst.flags & DST_HOST))
101 rt6_bind_peer(rt, 1);
103 peer = rt->rt6i_peer;
105 u32 *old_p = __DST_METRICS_PTR(old);
106 unsigned long prev, new;
109 if (inet_metrics_new(peer))
110 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
112 new = (unsigned long) p;
/* Only install our pointer if nobody changed dst->_metrics meanwhile. */
113 prev = cmpxchg(&dst->_metrics, old, new);
116 p = __DST_METRICS_PTR(prev);
117 if (prev & DST_METRICS_READ_ONLY)
/*
 * Choose the address to resolve in the neighbour table for a route:
 * the route's gateway when it is not the unspecified address.
 * NOTE(review): the fall-through presumably returns the caller's
 * daddr - confirm.
 */
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
126 struct in6_addr *p = &rt->rt6i_gateway;
128 if (!ipv6_addr_any(p))
129 return (const void *) p;
/*
 * neigh_lookup callback (installed in both dst_ops tables of this file).
 * Resolves the address chosen by choose_neigh_daddr() in nd_tbl on
 * dst->dev; the visible fallback creates the neighbour entry with
 * neigh_create().
 */
133 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
135 struct rt6_info *rt = (struct rt6_info *) dst;
138 daddr = choose_neigh_daddr(rt, daddr);
139 n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
142 return neigh_create(&nd_tbl, daddr, dst->dev);
/*
 * Attach a neighbour entry for the route's gateway (rt->rt6i_gateway)
 * on 'dev' to rt->dst.  The entry is looked up in nd_tbl and, on the
 * visible fallback, created with neigh_create().
 * Callers in this file treat a non-zero return as failure.
 */
145 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
147 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
149 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
153 dst_set_neighbour(&rt->dst, n);
/*
 * dst_ops template for regular IPv6 routes; wires up the callbacks
 * implemented in this file.
 * NOTE(review): ip6_dst_gc() recovers its struct net from
 * net->ipv6.ip6_dst_ops, so this template is presumably copied per
 * network namespace - confirm.
 */
158 static struct dst_ops ip6_dst_ops_template = {
160 .protocol = cpu_to_be16(ETH_P_IPV6),
163 .check = ip6_dst_check,
164 .default_advmss = ip6_default_advmss,
166 .cow_metrics = ipv6_cow_metrics,
167 .destroy = ip6_dst_destroy,
168 .ifdown = ip6_dst_ifdown,
169 .negative_advice = ip6_negative_advice,
170 .link_failure = ip6_link_failure,
171 .update_pmtu = ip6_rt_update_pmtu,
172 .local_out = __ip6_local_out,
173 .neigh_lookup = ip6_neigh_lookup,
/*
 * mtu callback for ip6_dst_blackhole_ops: return the raw RTAX_MTU
 * metric when it is non-zero, otherwise the MTU of the dst's device.
 */
176 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
178 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
180 return mtu ? : dst->dev->mtu;
/* update_pmtu callback installed in ip6_dst_blackhole_ops. */
183 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
/* cow_metrics callback installed in ip6_dst_blackhole_ops. */
187 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/*
 * dst_ops used for the entries allocated by ip6_blackhole_route().
 * Shares destroy/check/default_advmss/neigh_lookup with the regular
 * IPv6 ops but uses blackhole-specific mtu, update_pmtu and
 * cow_metrics callbacks.
 */
193 static struct dst_ops ip6_dst_blackhole_ops = {
195 .protocol = cpu_to_be16(ETH_P_IPV6),
196 .destroy = ip6_dst_destroy,
197 .check = ip6_dst_check,
198 .mtu = ip6_blackhole_mtu,
199 .default_advmss = ip6_default_advmss,
200 .update_pmtu = ip6_rt_blackhole_update_pmtu,
201 .cow_metrics = ip6_rt_blackhole_cow_metrics,
202 .neigh_lookup = ip6_neigh_lookup,
/*
 * Read-only metrics array with the hop limit slot explicitly zero.
 * ip6_dst_hoplimit() treats a zero RTAX_HOPLIMIT metric as "use the
 * per-device or per-namespace default".
 */
205 static const u32 ip6_template_metrics[RTAX_MAX] = {
206 [RTAX_HOPLIMIT - 1] = 0,
/*
 * Template for the "null" route entry: a reject route with the largest
 * possible metric whose input/output handlers discard the packet and
 * whose dst error is -ENETUNREACH.
 */
209 static struct rt6_info ip6_null_entry_template = {
211 .__refcnt = ATOMIC_INIT(1),
214 .error = -ENETUNREACH,
215 .input = ip6_pkt_discard,
216 .output = ip6_pkt_discard_out,
218 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
219 .rt6i_protocol = RTPROT_KERNEL,
220 .rt6i_metric = ~(u32) 0,
221 .rt6i_ref = ATOMIC_INIT(1),
/*
 * Only with CONFIG_IPV6_MULTIPLE_TABLES: template for the "prohibit"
 * route entry, a reject route whose input/output handlers are
 * ip6_pkt_prohibit() and ip6_pkt_prohibit_out().
 */
224 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
226 static int ip6_pkt_prohibit(struct sk_buff *skb);
227 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
229 static struct rt6_info ip6_prohibit_entry_template = {
231 .__refcnt = ATOMIC_INIT(1),
235 .input = ip6_pkt_prohibit,
236 .output = ip6_pkt_prohibit_out,
238 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
239 .rt6i_protocol = RTPROT_KERNEL,
240 .rt6i_metric = ~(u32) 0,
241 .rt6i_ref = ATOMIC_INIT(1),
/*
 * Template for the "blackhole" route entry: a reject route whose
 * input and output handlers are both dst_discard().
 */
244 static struct rt6_info ip6_blk_hole_entry_template = {
246 .__refcnt = ATOMIC_INIT(1),
250 .input = dst_discard,
251 .output = dst_discard,
253 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
254 .rt6i_protocol = RTPROT_KERNEL,
255 .rt6i_metric = ~(u32) 0,
256 .rt6i_ref = ATOMIC_INIT(1),
/*
 * Allocate an rt6_info through dst_alloc() with the given ops, device
 * and flags, then zero the rt6_info-specific fields starting at
 * rt6i_table.
 * NOTE(review): the memset length assumes rt6i_table is the first
 * member after the embedded struct dst_entry - confirm against the
 * struct definition.
 */
261 /* allocate dst with ip6_dst_ops */
262 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
263 struct net_device *dev,
266 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
269 memset(&rt->rt6i_table, 0,
270 sizeof(*rt) - sizeof(struct dst_entry));
/*
 * destroy callback: release what an rt6_info holds beyond the generic
 * dst.  Non-host routes free their metrics through
 * dst_destroy_metrics_generic(); the idev and peer pointers are
 * cleared; a route without RTF_EXPIRES that has dst->from set drops
 * its reference on that parent dst.
 */
275 static void ip6_dst_destroy(struct dst_entry *dst)
277 struct rt6_info *rt = (struct rt6_info *)dst;
278 struct inet6_dev *idev = rt->rt6i_idev;
279 struct inet_peer *peer = rt->rt6i_peer;
281 if (!(rt->dst.flags & DST_HOST))
282 dst_destroy_metrics_generic(dst);
285 rt->rt6i_idev = NULL;
289 if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
290 dst_release(dst->from);
293 rt->rt6i_peer = NULL;
/*
 * Generation counter for inet_peer bindings.  Routes record the value
 * in rt6i_peer_genid (see rt6_bind_peer()) and ip6_dst_check() compares
 * against it to notice a stale binding.
 */
298 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
/* Return the current peer generation id. */
300 static u32 rt6_peer_genid(void)
302 return atomic_read(&__rt6_peer_genid);
/*
 * Bind the inet_peer for the route's destination address to rt.
 * @create is passed through to inet_getpeer_v6().
 *
 * The peer is installed with cmpxchg() so that only the first binder
 * wins; the losing path is taken when rt6i_peer was already non-NULL.
 * The route then records the current peer generation id.
 */
305 void rt6_bind_peer(struct rt6_info *rt, int create)
307 struct inet_peer *peer;
309 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
310 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
313 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * ifdown callback: when 'dev' (other than the namespace's loopback
 * device) is the device of the route's inet6_dev, re-point
 * rt->rt6i_idev at the loopback device's inet6_dev.
 */
316 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
319 struct rt6_info *rt = (struct rt6_info *)dst;
320 struct inet6_dev *idev = rt->rt6i_idev;
321 struct net_device *loopback_dev =
322 dev_net(dev)->loopback_dev;
324 if (dev != loopback_dev && idev && idev->dev == dev) {
325 struct inet6_dev *loopback_idev =
326 in6_dev_get(loopback_dev);
328 rt->rt6i_idev = loopback_idev;
/*
 * Tell whether a route has expired.
 *
 * A route with RTF_EXPIRES is compared against its own dst.expires.
 * Otherwise, if the route has a parent in dst.from, it counts as
 * expired when that parent carries RTF_EXPIRES and the parent's
 * expiry time has passed.
 */
334 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
336 struct rt6_info *ort = NULL;
338 if (rt->rt6i_flags & RTF_EXPIRES) {
339 if (time_after(jiffies, rt->dst.expires))
341 } else if (rt->dst.from) {
342 ort = (struct rt6_info *) rt->dst.from;
343 return (ort->rt6i_flags & RTF_EXPIRES) &&
344 time_after(jiffies, ort->dst.expires);
/*
 * Non-zero when daddr is a multicast, link-local or loopback address.
 * Callers in this file use the result to add RT6_LOOKUP_F_IFACE to
 * the lookup flags.
 */
349 static inline int rt6_need_strict(const struct in6_addr *daddr)
351 return ipv6_addr_type(daddr) &
352 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
356 * Route lookup. Any table->tb6_lock is implied.
/*
 * Walk the route chain starting at 'rt' (linked through dst.rt6_next)
 * and look for an entry that fits the outgoing interface 'oif' or,
 * failing that, the source address 'saddr'.
 *
 * Nothing is filtered when neither an oif nor a source address is
 * given.  Loopback-device routes are matched through the device of
 * their rt6i_idev and remembered in 'local' as a candidate.  With
 * RT6_LOOKUP_F_IFACE set, the visible failure path returns
 * net->ipv6.ip6_null_entry.
 */
359 static inline struct rt6_info *rt6_device_match(struct net *net,
361 const struct in6_addr *saddr,
365 struct rt6_info *local = NULL;
366 struct rt6_info *sprt;
368 if (!oif && ipv6_addr_any(saddr))
371 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
372 struct net_device *dev = sprt->dst.dev;
375 if (dev->ifindex == oif)
377 if (dev->flags & IFF_LOOPBACK) {
378 if (!sprt->rt6i_idev ||
379 sprt->rt6i_idev->dev->ifindex != oif) {
380 if (flags & RT6_LOOKUP_F_IFACE && oif)
382 if (local && (!oif ||
383 local->rt6i_idev->dev->ifindex == oif))
389 if (ipv6_chk_addr(net, saddr, dev,
390 flags & RT6_LOOKUP_F_IFACE))
399 if (flags & RT6_LOOKUP_F_IFACE)
400 return net->ipv6.ip6_null_entry;
/*
 * Router reachability probing.  The real implementation exists only
 * with CONFIG_IPV6_ROUTER_PREF; otherwise an inline stub of the same
 * name is declared.
 *
 * Nothing is done when the route has no neighbour or the neighbour is
 * already in a NUD_VALID state.  Otherwise, under neigh->lock, a
 * neighbour solicitation is sent if more than rtr_probe_interval
 * jiffies have passed since neigh->updated; the target is the
 * neighbour's primary key and the destination is the multicast
 * address computed by addrconf_addr_solict_mult().
 */
406 #ifdef CONFIG_IPV6_ROUTER_PREF
407 static void rt6_probe(struct rt6_info *rt)
409 struct neighbour *neigh;
411 * Okay, this does not seem to be appropriate
412 * for now, however, we need to check if it
413 * is really so; aka Router Reachability Probing.
415 * Router Reachability Probe MUST be rate-limited
416 * to no more than one per minute.
419 neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
420 if (!neigh || (neigh->nud_state & NUD_VALID))
422 read_lock_bh(&neigh->lock);
/* Re-check the state under the lock and apply the rate limit. */
423 if (!(neigh->nud_state & NUD_VALID) &&
424 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
425 struct in6_addr mcaddr;
426 struct in6_addr *target;
428 neigh->updated = jiffies;
/* The lock is dropped before the solicitation is sent. */
429 read_unlock_bh(&neigh->lock);
431 target = (struct in6_addr *)&neigh->primary_key;
432 addrconf_addr_solict_mult(target, &mcaddr);
433 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
435 read_unlock_bh(&neigh->lock);
441 static inline void rt6_probe(struct rt6_info *rt)
447 * Default Router Selection (RFC 2461 6.3.6)
/*
 * Check a route's device against the requested interface 'oif'.
 * Two cases are recognised: no oif given or the route's device has
 * that ifindex; and a loopback-device route whose rt6i_idev belongs
 * to the device with that ifindex.
 * The result feeds the score computed by rt6_score_route().
 */
449 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
451 struct net_device *dev = rt->dst.dev;
452 if (!oif || dev->ifindex == oif)
454 if ((dev->flags & IFF_LOOPBACK) &&
455 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/*
 * Classify the reachability of a route's next hop.
 *
 * Routes flagged RTF_NONEXTHOP or lacking RTF_GATEWAY are handled
 * without looking at neighbour state.  Otherwise the neighbour's
 * nud_state is inspected under neigh->lock: NUD_VALID is one case and,
 * with CONFIG_IPV6_ROUTER_PREF, NUD_FAILED is a distinct one.
 */
460 static inline int rt6_check_neigh(struct rt6_info *rt)
462 struct neighbour *neigh;
466 neigh = dst_get_neighbour_noref(&rt->dst);
467 if (rt->rt6i_flags & RTF_NONEXTHOP ||
468 !(rt->rt6i_flags & RTF_GATEWAY))
471 read_lock_bh(&neigh->lock);
472 if (neigh->nud_state & NUD_VALID)
474 #ifdef CONFIG_IPV6_ROUTER_PREF
475 else if (neigh->nud_state & NUD_FAILED)
480 read_unlock_bh(&neigh->lock);
/*
 * Compute a score for a candidate route.
 *
 * The device check result forms the low bits; with
 * CONFIG_IPV6_ROUTER_PREF the decoded router preference is OR-ed in
 * shifted left by two.  A failed device check under
 * RT6_LOOKUP_F_IFACE, or a failed neighbour check under
 * RT6_LOOKUP_F_REACHABLE, is treated specially.
 */
487 static int rt6_score_route(struct rt6_info *rt, int oif,
492 m = rt6_check_dev(rt, oif);
493 if (!m && (strict & RT6_LOOKUP_F_IFACE))
495 #ifdef CONFIG_IPV6_ROUTER_PREF
496 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
498 n = rt6_check_neigh(rt);
499 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/*
 * Consider 'rt' as a replacement for the current best 'match'.
 * Expired routes are tested for first; otherwise the route is scored
 * with rt6_score_route().  *mpri is the caller's running best score
 * (find_rr_leaf() passes the same variable for every candidate).
 * RT6_LOOKUP_F_REACHABLE is consulted on both visible branches.
 */
504 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
505 int *mpri, struct rt6_info *match)
509 if (rt6_check_expired(rt))
512 m = rt6_score_route(rt, oif, strict);
517 if (strict & RT6_LOOKUP_F_REACHABLE)
521 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * Find the best route among the entries of 'fn' that share 'metric'.
 *
 * The scan starts at the round-robin head and runs to the end of the
 * equal-metric run, then wraps around from fn->leaf up to (but not
 * including) rr_head, so every candidate is passed to find_match()
 * once.
 */
529 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
530 struct rt6_info *rr_head,
531 u32 metric, int oif, int strict)
533 struct rt6_info *rt, *match;
537 for (rt = rr_head; rt && rt->rt6i_metric == metric;
538 rt = rt->dst.rt6_next)
539 match = find_match(rt, oif, strict, &mpri, match);
540 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
541 rt = rt->dst.rt6_next)
542 match = find_match(rt, oif, strict, &mpri, match);
/*
 * Select a route from node 'fn' for the given oif and strictness.
 *
 * Candidates are evaluated by find_rr_leaf() starting from the node's
 * round-robin pointer (fn->rr_ptr is reset to fn->leaf on one visible
 * path).  The round-robin step under RT6_LOOKUP_F_REACHABLE looks at
 * the entry after rt0 and checks whether it is missing or has a
 * different metric.  Returns the match, or the namespace's
 * ip6_null_entry when nothing matched.
 */
547 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
549 struct rt6_info *match, *rt0;
554 fn->rr_ptr = rt0 = fn->leaf;
556 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
559 (strict & RT6_LOOKUP_F_REACHABLE)) {
560 struct rt6_info *next = rt0->dst.rt6_next;
562 /* no entries matched; do round-robin */
563 if (!next || next->rt6i_metric != rt0->rt6i_metric)
570 net = dev_net(rt0->dst.dev);
571 return match ? match : net->ipv6.ip6_null_entry;
/*
 * Process a received route information option (only built with
 * CONFIG_IPV6_ROUTE_INFO).
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Visible steps: validate the option length against prefix_len, check
 * the route preference against ICMPV6_ROUTER_PREF_INVALID, convert the
 * lifetime, derive the prefix (used in place when length == 3,
 * otherwise masked into a local buffer), look up an existing route,
 * add one if needed, then update the preference flags and the expiry
 * before dropping the reference.
 */
574 #ifdef CONFIG_IPV6_ROUTE_INFO
575 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
576 const struct in6_addr *gwaddr)
578 struct net *net = dev_net(dev);
579 struct route_info *rinfo = (struct route_info *) opt;
580 struct in6_addr prefix_buf, *prefix;
582 unsigned long lifetime;
585 if (len < sizeof(struct route_info)) {
589 /* Sanity check for prefix_len and length */
590 if (rinfo->length > 3) {
592 } else if (rinfo->prefix_len > 128) {
594 } else if (rinfo->prefix_len > 64) {
595 if (rinfo->length < 2) {
598 } else if (rinfo->prefix_len > 0) {
599 if (rinfo->length < 1) {
604 pref = rinfo->route_pref;
605 if (pref == ICMPV6_ROUTER_PREF_INVALID)
608 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
610 if (rinfo->length == 3)
611 prefix = (struct in6_addr *)rinfo->prefix;
613 /* this function is safe */
614 ipv6_addr_prefix(&prefix_buf,
615 (struct in6_addr *)rinfo->prefix,
617 prefix = &prefix_buf;
620 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* An existing route advertised with a zero lifetime is a special case. */
623 if (rt && !lifetime) {
629 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
/* Replace the preference bits, keep the other flags, mark as RTF_ROUTEINFO. */
632 rt->rt6i_flags = RTF_ROUTEINFO |
633 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
636 if (!addrconf_finite_timeout(lifetime))
637 rt6_clean_expires(rt);
639 rt6_set_expires(rt, jiffies + HZ * lifetime);
641 dst_release(&rt->dst);
/*
 * BACKTRACK(__net, saddr): used by the lookup functions after a route
 * selection.  It acts only when 'rt' is the namespace's ip6_null_entry
 * and stops at a node flagged RTN_TL_ROOT.  A subtree of 'pn' that is
 * not 'fn' itself is searched by source address with fib6_lookup(),
 * and a node carrying RTN_RTINFO is tested for.
 * The macro relies on the caller's local variables 'rt' and 'fn'.
 */
647 #define BACKTRACK(__net, saddr) \
649 if (rt == __net->ipv6.ip6_null_entry) { \
650 struct fib6_node *pn; \
652 if (fn->fn_flags & RTN_TL_ROOT) \
655 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
656 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
659 if (fn->fn_flags & RTN_RTINFO) \
/*
 * Per-table lookup callback passed to fib6_rule_lookup() by
 * ip6_route_lookup() and rt6_lookup().
 *
 * Under the table's read lock: find the node for the flow's
 * destination/source, filter its routes with rt6_device_match(),
 * backtrack when the result is the null entry, and mark the chosen
 * dst as used via dst_use() before unlocking.
 */
665 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
666 struct fib6_table *table,
667 struct flowi6 *fl6, int flags)
669 struct fib6_node *fn;
672 read_lock_bh(&table->tb6_lock);
673 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
676 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
677 BACKTRACK(net, &fl6->saddr);
679 dst_use(&rt->dst, jiffies);
680 read_unlock_bh(&table->tb6_lock);
/*
 * Exported (GPL) lookup entry point: runs the policy rule lookup for
 * the flow with ip6_pol_route_lookup() as the per-table callback.
 */
685 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
688 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
690 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/*
 * Exported convenience lookup by destination (and optional source)
 * address.
 *
 * A non-zero 'strict' turns on RT6_LOOKUP_F_IFACE.  On the path that
 * copies 'saddr' into the flow, RT6_LOOKUP_F_HAS_SADDR is set as well.
 * The lookup itself goes through fib6_rule_lookup() with
 * ip6_pol_route_lookup().
 */
692 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
693 const struct in6_addr *saddr, int oif, int strict)
695 struct flowi6 fl6 = {
699 struct dst_entry *dst;
700 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
703 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
704 flags |= RT6_LOOKUP_F_HAS_SADDR;
707 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
709 return (struct rt6_info *) dst;
716 EXPORT_SYMBOL(rt6_lookup);
718 /* ip6_ins_rt is called with FREE table->tb6_lock.
719 It takes a new route entry; if the addition fails for any reason, the
720 route is freed. In any case, if caller does not hold it, it may
/*
 * Insert 'rt' into the table recorded in rt->rt6i_table.
 * The table's write lock is taken around fib6_add(), so the caller
 * must not already hold it.  'info' is passed through to fib6_add().
 */
724 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
727 struct fib6_table *table;
729 table = rt->rt6i_table;
730 write_lock_bh(&table->tb6_lock);
731 err = fib6_add(&table->tb6_root, rt, info);
732 write_unlock_bh(&table->tb6_lock);
/*
 * Insert a route using an nl_info whose nl_net is the network
 * namespace of the route's device; see __ip6_ins_rt().
 */
737 int ip6_ins_rt(struct rt6_info *rt)
739 struct nl_info info = {
740 .nl_net = dev_net(rt->dst.dev),
742 return __ip6_ins_rt(rt, &info);
/*
 * Create a cached (RTF_CACHE) copy of 'ort' for destination 'daddr'.
 *
 * For a non-gateway route the destination itself becomes the next hop
 * (rt6i_gateway = *daddr), and the copy is flagged RTF_ANYCAST when
 * 'ort' is not a /128 and daddr equals its prefix address.  With
 * CONFIG_IPV6_SUBTREES a source-specific route is pinned to
 * saddr/128.
 *
 * If binding the neighbour fails, one garbage-collection pass may be
 * forced (only when not running in softirq context): the gc
 * elasticity and min-interval sysctls are temporarily overridden,
 * ip6_dst_gc() is run, and the saved values are restored.  The
 * "Neighbour table overflow" message belongs to the failure path.
 */
745 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
746 const struct in6_addr *daddr,
747 const struct in6_addr *saddr)
755 rt = ip6_rt_copy(ort, daddr);
/* One forced-gc attempt is allowed, and none when in softirq context. */
758 int attempts = !in_softirq();
760 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
761 if (ort->rt6i_dst.plen != 128 &&
762 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
763 rt->rt6i_flags |= RTF_ANYCAST;
764 rt->rt6i_gateway = *daddr;
767 rt->rt6i_flags |= RTF_CACHE;
769 #ifdef CONFIG_IPV6_SUBTREES
770 if (rt->rt6i_src.plen && saddr) {
771 rt->rt6i_src.addr = *saddr;
772 rt->rt6i_src.plen = 128;
777 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
778 struct net *net = dev_net(rt->dst.dev);
779 int saved_rt_min_interval =
780 net->ipv6.sysctl.ip6_rt_gc_min_interval;
781 int saved_rt_elasticity =
782 net->ipv6.sysctl.ip6_rt_gc_elasticity;
784 if (attempts-- > 0) {
/* Temporarily override the gc sysctls for this one forced pass. */
785 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
786 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
788 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
790 net->ipv6.sysctl.ip6_rt_gc_elasticity =
792 net->ipv6.sysctl.ip6_rt_gc_min_interval =
793 saved_rt_min_interval;
799 "ipv6: Neighbour table overflow.\n");
/*
 * Create a cached (RTF_CACHE) copy of 'ort' for 'daddr' that shares
 * the original route's neighbour entry (via neigh_clone()) instead of
 * resolving a new one.
 */
808 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
809 const struct in6_addr *daddr)
811 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
814 rt->rt6i_flags |= RTF_CACHE;
815 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
/*
 * Core per-table route resolution shared by the input and output
 * paths.
 *
 * RT6_LOOKUP_F_REACHABLE is requested unless forwarding is enabled in
 * devconf_all.  Under the table's read lock a node is looked up and a
 * route chosen with rt6_select(), backtracking if needed.  The null
 * entry and already-cached routes are tested for together.
 * Otherwise the lock is dropped and a cache entry is built:
 * rt6_alloc_cow() when the route has no neighbour and is not
 * RTF_NONEXTHOP, rt6_alloc_clone() for non-host routes.  The new entry
 * is inserted with ip6_ins_rt(); as the in-line comment notes, another
 * CPU may have inserted the same route while the lock was released,
 * which is handled by a relookup.  The chosen dst's lastuse is
 * refreshed before returning.
 */
820 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
821 struct flowi6 *fl6, int flags)
823 struct fib6_node *fn;
824 struct rt6_info *rt, *nrt;
828 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
830 strict |= flags & RT6_LOOKUP_F_IFACE;
833 read_lock_bh(&table->tb6_lock);
836 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
839 rt = rt6_select(fn, oif, strict | reachable);
841 BACKTRACK(net, &fl6->saddr);
842 if (rt == net->ipv6.ip6_null_entry ||
843 rt->rt6i_flags & RTF_CACHE)
847 read_unlock_bh(&table->tb6_lock);
849 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
850 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
851 else if (!(rt->dst.flags & DST_HOST))
852 nrt = rt6_alloc_clone(rt, &fl6->daddr);
856 dst_release(&rt->dst);
/* Fall back to the null entry when no cache entry could be allocated. */
857 rt = nrt ? : net->ipv6.ip6_null_entry;
861 err = ip6_ins_rt(nrt);
870 * Race condition! In the gap, when table->tb6_lock was
871 * released someone could insert this route. Relookup.
873 dst_release(&rt->dst);
882 read_unlock_bh(&table->tb6_lock);
884 rt->dst.lastuse = jiffies;
/*
 * Input-path table lookup callback: ip6_pol_route() keyed on the
 * flow's incoming interface (flowi6_iif).
 */
890 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
891 struct flowi6 *fl6, int flags)
893 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/*
 * Input-path rule lookup.  Adds RT6_LOOKUP_F_IFACE when the
 * destination needs a strict interface match (see rt6_need_strict())
 * and the receiving device is not of type ARPHRD_PIMREG.
 */
896 static struct dst_entry *ip6_route_input_lookup(struct net *net,
897 struct net_device *dev,
898 struct flowi6 *fl6, int flags)
900 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
901 flags |= RT6_LOOKUP_F_IFACE;
903 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/*
 * Route a received packet: build a flow key from the skb (incoming
 * ifindex, flow label from the first 32-bit word of the IPv6 header,
 * skb mark, next header) and attach the resulting dst to the skb.
 */
906 void ip6_route_input(struct sk_buff *skb)
908 const struct ipv6hdr *iph = ipv6_hdr(skb);
909 struct net *net = dev_net(skb->dev);
910 int flags = RT6_LOOKUP_F_HAS_SADDR;
911 struct flowi6 fl6 = {
912 .flowi6_iif = skb->dev->ifindex,
915 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
916 .flowi6_mark = skb->mark,
917 .flowi6_proto = iph->nexthdr,
920 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/*
 * Output-path table lookup callback: ip6_pol_route() keyed on the
 * flow's outgoing interface (flowi6_oif).
 */
923 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
924 struct flowi6 *fl6, int flags)
926 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/*
 * Exported output-path route lookup.
 *
 * RT6_LOOKUP_F_IFACE is set when the socket is bound to a device or
 * the destination needs a strict interface match;
 * RT6_LOOKUP_F_HAS_SADDR when the flow carries a source address.  The
 * socket's source-address preferences are folded into the flags via
 * rt6_srcprefs2flags().
 */
929 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
934 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
935 flags |= RT6_LOOKUP_F_IFACE;
937 if (!ipv6_addr_any(&fl6->saddr))
938 flags |= RT6_LOOKUP_F_HAS_SADDR;
940 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
942 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
945 EXPORT_SYMBOL(ip6_route_output);
/*
 * Build a blackhole copy of 'dst_orig': a dst allocated with
 * ip6_dst_blackhole_ops whose input and output are dst_discard() but
 * which carries over the original's metrics, idev (with a reference),
 * gateway, flags and destination/source keys.  Expiry is cleared on
 * the copy.
 *
 * The reference on dst_orig is always released.  Returns the new dst,
 * or ERR_PTR(-ENOMEM) when none was created.
 */
947 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
949 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
950 struct dst_entry *new = NULL;
952 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
954 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
959 new->input = dst_discard;
960 new->output = dst_discard;
/* Read-only metrics are shared by pointer; otherwise they are copied. */
962 if (dst_metrics_read_only(&ort->dst))
963 new->_metrics = ort->dst._metrics;
965 dst_copy_metrics(new, &ort->dst);
966 rt->rt6i_idev = ort->rt6i_idev;
968 in6_dev_hold(rt->rt6i_idev);
970 rt->rt6i_gateway = ort->rt6i_gateway;
971 rt->rt6i_flags = ort->rt6i_flags;
972 rt6_clean_expires(rt);
975 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
976 #ifdef CONFIG_IPV6_SUBTREES
977 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
983 dst_release(dst_orig);
984 return new ? new : ERR_PTR(-ENOMEM);
988 * Destination cache support functions
/*
 * check callback: validate a cached dst against 'cookie'.
 *
 * The route is considered current when it is still attached to a fib
 * node whose serial number equals the cookie.  In that case a stale
 * peer generation id triggers a re-bind of the inet_peer (without
 * creating one) and the id is refreshed.
 */
991 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
995 rt = (struct rt6_info *) dst;
997 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
998 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1000 rt6_bind_peer(rt, 0);
1001 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * negative_advice callback.  The visible logic singles out cached
 * routes (RTF_CACHE) and, among them, those that have expired
 * according to rt6_check_expired().
 */
1008 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1010 struct rt6_info *rt = (struct rt6_info *) dst;
1013 if (rt->rt6i_flags & RTF_CACHE) {
1014 if (rt6_check_expired(rt)) {
/*
 * link_failure callback: report "address unreachable" to the sender
 * with an ICMPv6 destination-unreachable message, then invalidate the
 * route attached to the skb.
 *
 * A cached route has its expiry set to 0 via rt6_update_expires().  A
 * default route still attached to a node gets the node's serial number
 * set to -1.
 * NOTE(review): presumably so the cookie comparison in ip6_dst_check()
 * stops matching - confirm.
 */
1026 static void ip6_link_failure(struct sk_buff *skb)
1028 struct rt6_info *rt;
1030 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1032 rt = (struct rt6_info *) skb_dst(skb);
1034 if (rt->rt6i_flags & RTF_CACHE)
1035 rt6_update_expires(rt, 0);
1036 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1037 rt->rt6i_node->fn_sernum = -1;
/*
 * update_pmtu callback: record a smaller path MTU on a host (/128)
 * route.
 *
 * Only acts when 'mtu' is below the current dst MTU.  The route is
 * flagged RTF_MODIFIED; an MTU below IPV6_MIN_MTU additionally sets
 * RTAX_FEATURE_ALLFRAG in the features metric.  Finally RTAX_MTU is
 * updated.
 */
1041 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1043 struct rt6_info *rt6 = (struct rt6_info*)dst;
1045 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1046 rt6->rt6i_flags |= RTF_MODIFIED;
1047 if (mtu < IPV6_MIN_MTU) {
1048 u32 features = dst_metric(dst, RTAX_FEATURES);
1050 features |= RTAX_FEATURE_ALLFRAG;
1051 dst_metric_set(dst, RTAX_FEATURES, features);
1053 dst_metric_set(dst, RTAX_MTU, mtu);
/*
 * default_advmss callback: derive the advertised MSS from the dst MTU
 * by subtracting the IPv6 and TCP header sizes, with the namespace's
 * ip6_rt_min_advmss sysctl as the lower bound.  Values above
 * IPV6_MAXPLEN - sizeof(struct tcphdr) are special-cased as described
 * by the in-line comment.
 */
1057 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1059 struct net_device *dev = dst->dev;
1060 unsigned int mtu = dst_mtu(dst);
1061 struct net *net = dev_net(dev);
1063 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1065 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1066 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1069 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1070 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1071 * IPV6_MAXPLEN is also valid and means: "any MSS,
1072 * rely only on pmtu discovery"
1074 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/*
 * MTU of a route: starts from the raw RTAX_MTU metric; the visible
 * alternative takes cnf.mtu6 from the inet6_dev of the dst's device.
 * NOTE(review): presumably the idev value is used only when the
 * metric is unset - confirm.
 */
1079 static unsigned int ip6_mtu(const struct dst_entry *dst)
1081 struct inet6_dev *idev;
1082 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1090 idev = __in6_dev_get(dst->dev);
1092 mtu = idev->cnf.mtu6;
/*
 * Singly linked list (through dst.next) of the dsts created by
 * icmp6_dst_alloc(), protected by icmp6_dst_lock.
 */
1098 static struct dst_entry *icmp6_dst_gc_list;
1099 static DEFINE_SPINLOCK(icmp6_dst_lock);
/*
 * Allocate a standalone host route (not inserted into the fib) towards
 * fl6->daddr on 'dev'.
 *
 * Returns ERR_PTR(-ENODEV) when the device has no inet6_dev; the
 * allocation-failure path sets ERR_PTR(-ENOMEM); a failed neighbour
 * lookup is returned via ERR_CAST().  On success the route is set up
 * as a /128 with ip6_output as output handler and a zero hop-limit
 * metric, pushed onto icmp6_dst_gc_list under icmp6_dst_lock, the fib
 * gc is started, and the dst is passed through xfrm_lookup().
 */
1101 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1102 struct neighbour *neigh,
1105 struct dst_entry *dst;
1106 struct rt6_info *rt;
1107 struct inet6_dev *idev = in6_dev_get(dev);
1108 struct net *net = dev_net(dev);
1110 if (unlikely(!idev))
1111 return ERR_PTR(-ENODEV);
1113 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1114 if (unlikely(!rt)) {
1116 dst = ERR_PTR(-ENOMEM);
1123 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1124 if (IS_ERR(neigh)) {
1127 return ERR_CAST(neigh);
1131 rt->dst.flags |= DST_HOST;
1132 rt->dst.output = ip6_output;
1133 dst_set_neighbour(&rt->dst, neigh);
1134 atomic_set(&rt->dst.__refcnt, 1);
1135 rt->rt6i_dst.addr = fl6->daddr;
1136 rt->rt6i_dst.plen = 128;
1137 rt->rt6i_idev = idev;
1138 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
/* Link the new dst at the head of the gc list. */
1140 spin_lock_bh(&icmp6_dst_lock);
1141 rt->dst.next = icmp6_dst_gc_list;
1142 icmp6_dst_gc_list = &rt->dst;
1143 spin_unlock_bh(&icmp6_dst_lock);
1145 fib6_force_start_gc(net);
1147 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/*
 * Walk icmp6_dst_gc_list under icmp6_dst_lock and pick out the
 * entries whose reference count has dropped to zero.
 * NOTE(review): such entries are presumably unlinked and freed, and
 * the return value presumably reports remaining entries - confirm.
 */
1153 int icmp6_dst_gc(void)
1155 struct dst_entry *dst, **pprev;
1158 spin_lock_bh(&icmp6_dst_lock);
1159 pprev = &icmp6_dst_gc_list;
1161 while ((dst = *pprev) != NULL) {
1162 if (!atomic_read(&dst->__refcnt)) {
1171 spin_unlock_bh(&icmp6_dst_lock);
/*
 * Apply 'func' to every entry on icmp6_dst_gc_list while holding
 * icmp6_dst_lock; entries for which func() returns non-zero are
 * selected for special handling.
 * NOTE(review): presumably they are removed from the list - confirm.
 */
1176 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1179 struct dst_entry *dst, **pprev;
1181 spin_lock_bh(&icmp6_dst_lock);
1182 pprev = &icmp6_dst_gc_list;
1183 while ((dst = *pprev) != NULL) {
1184 struct rt6_info *rt = (struct rt6_info *) dst;
1185 if (func(rt, arg)) {
1192 spin_unlock_bh(&icmp6_dst_lock);
/*
 * Garbage-collect the IPv6 routing cache of the namespace that owns
 * 'ops'.
 *
 * The early-out condition is: the minimum gc interval has not yet
 * elapsed since the last run and the entry count is within
 * ip6_rt_max_size.  Otherwise the adaptive expire value is bumped, the
 * fib gc is run with it, and the last-gc timestamp is updated.  If the
 * recount falls below gc_thresh, the expire value is reset to half of
 * ip6_rt_gc_timeout; it is then decayed by its own value shifted right
 * by the elasticity sysctl.
 *
 * Returns non-zero when the cache still holds more than
 * ip6_rt_max_size entries.
 */
1195 static int ip6_dst_gc(struct dst_ops *ops)
1197 unsigned long now = jiffies;
1198 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1199 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1200 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1201 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1202 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1203 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1206 entries = dst_entries_get_fast(ops);
1207 if (time_after(rt_last_gc + rt_min_interval, now) &&
1208 entries <= rt_max_size)
1211 net->ipv6.ip6_rt_gc_expire++;
1212 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1213 net->ipv6.ip6_rt_last_gc = now;
1214 entries = dst_entries_get_slow(ops);
1215 if (entries < ops->gc_thresh)
1216 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1218 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1219 return entries > rt_max_size;
1222 /* Clean host part of a prefix. Not necessary in radix tree,
1223 but results in cleaner routing tables.
1225 Remove it only when all the things will work!
/*
 * Exported: hop limit to use for packets sent over 'dst'.
 *
 * Starts from the raw RTAX_HOPLIMIT metric.  A value of 0 means "not
 * set", in which case the limit comes from the device's inet6_dev
 * configuration or from the namespace-wide devconf_all default.
 */
1228 int ip6_dst_hoplimit(struct dst_entry *dst)
1230 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1231 if (hoplimit == 0) {
1232 struct net_device *dev = dst->dev;
1233 struct inet6_dev *idev;
1236 idev = __in6_dev_get(dev);
1238 hoplimit = idev->cnf.hop_limit;
1240 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1245 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * Create a route from a fib6_config and insert it into its table.
 *
 * Visible steps, in order:
 *  - reject prefix lengths above 128 (and, without
 *    CONFIG_IPV6_SUBTREES, test for a source prefix length);
 *  - resolve fc_ifindex to a device and its inet6_dev;
 *  - default the metric to IP6_RT_PRIO_USER;
 *  - find the table; a netlink request without NLM_F_CREATE first
 *    tries fib6_get_table() and warns before falling back to
 *    fib6_new_table();
 *  - allocate the rt6_info and fill in expiry, protocol, input/output
 *    handlers, destination (and source) keys and metric;
 *  - turn explicit reject routes, and non-local routes via a loopback
 *    device to a non-loopback destination, into reject routes bound to
 *    the namespace's loopback device;
 *  - validate a gateway: unless it is a link-local unicast address, it
 *    must be unicast and resolvable via rt6_lookup(), and the device
 *    must agree with the route found;
 *  - validate the preferred source address with ipv6_chk_addr();
 *  - bind a neighbour for gateway / nonexthop routes;
 *  - apply the netlink metrics attributes;
 *  - insert through __ip6_ins_rt().
 */
1251 int ip6_route_add(struct fib6_config *cfg)
1254 struct net *net = cfg->fc_nlinfo.nl_net;
1255 struct rt6_info *rt = NULL;
1256 struct net_device *dev = NULL;
1257 struct inet6_dev *idev = NULL;
1258 struct fib6_table *table;
1261 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1263 #ifndef CONFIG_IPV6_SUBTREES
1264 if (cfg->fc_src_len)
1267 if (cfg->fc_ifindex) {
1269 dev = dev_get_by_index(net, cfg->fc_ifindex);
1272 idev = in6_dev_get(dev);
1277 if (cfg->fc_metric == 0)
1278 cfg->fc_metric = IP6_RT_PRIO_USER;
1281 if (cfg->fc_nlinfo.nlh &&
1282 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1283 table = fib6_get_table(net, cfg->fc_table);
1285 printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1286 table = fib6_new_table(net, cfg->fc_table);
1289 table = fib6_new_table(net, cfg->fc_table);
1295 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1302 rt->dst.obsolete = -1;
1304 if (cfg->fc_flags & RTF_EXPIRES)
1305 rt6_set_expires(rt, jiffies +
1306 clock_t_to_jiffies(cfg->fc_expires));
1308 rt6_clean_expires(rt);
1310 if (cfg->fc_protocol == RTPROT_UNSPEC)
1311 cfg->fc_protocol = RTPROT_BOOT;
1312 rt->rt6i_protocol = cfg->fc_protocol;
1314 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Input handler depends on the destination type and RTF_LOCAL. */
1316 if (addr_type & IPV6_ADDR_MULTICAST)
1317 rt->dst.input = ip6_mc_input;
1318 else if (cfg->fc_flags & RTF_LOCAL)
1319 rt->dst.input = ip6_input;
1321 rt->dst.input = ip6_forward;
1323 rt->dst.output = ip6_output;
1325 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1326 rt->rt6i_dst.plen = cfg->fc_dst_len;
1327 if (rt->rt6i_dst.plen == 128)
1328 rt->dst.flags |= DST_HOST;
/* Non-host routes with metrics attributes get a private, zeroed metrics array. */
1330 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1331 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1336 dst_init_metrics(&rt->dst, metrics, 0);
1338 #ifdef CONFIG_IPV6_SUBTREES
1339 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1340 rt->rt6i_src.plen = cfg->fc_src_len;
1343 rt->rt6i_metric = cfg->fc_metric;
1345 /* We cannot add true routes via loopback here,
1346 they would result in kernel looping; promote them to reject routes
1348 if ((cfg->fc_flags & RTF_REJECT) ||
1349 (dev && (dev->flags & IFF_LOOPBACK) &&
1350 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1351 !(cfg->fc_flags & RTF_LOCAL))) {
1352 /* hold loopback dev/idev if we haven't done so. */
1353 if (dev != net->loopback_dev) {
1358 dev = net->loopback_dev;
1360 idev = in6_dev_get(dev);
1366 rt->dst.output = ip6_pkt_discard_out;
1367 rt->dst.input = ip6_pkt_discard;
1368 rt->dst.error = -ENETUNREACH;
1369 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1373 if (cfg->fc_flags & RTF_GATEWAY) {
1374 const struct in6_addr *gw_addr;
1377 gw_addr = &cfg->fc_gateway;
1378 rt->rt6i_gateway = *gw_addr;
1379 gwa_type = ipv6_addr_type(gw_addr);
1381 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1382 struct rt6_info *grt;
1384 /* IPv6 strictly inhibits using not link-local
1385 addresses as nexthop address.
1386 Otherwise, router will not able to send redirects.
1387 It is very good, but in some (rare!) circumstances
1388 (SIT, PtP, NBMA NOARP links) it is handy to allow
1389 some exceptions. --ANK
1392 if (!(gwa_type & IPV6_ADDR_UNICAST))
1395 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1397 err = -EHOSTUNREACH;
1401 if (dev != grt->dst.dev) {
1402 dst_release(&grt->dst);
1407 idev = grt->rt6i_idev;
1409 in6_dev_hold(grt->rt6i_idev);
1411 if (!(grt->rt6i_flags & RTF_GATEWAY))
1413 dst_release(&grt->dst);
1419 if (!dev || (dev->flags & IFF_LOOPBACK))
1427 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1428 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1432 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1433 rt->rt6i_prefsrc.plen = 128;
1435 rt->rt6i_prefsrc.plen = 0;
1437 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1438 err = rt6_bind_neighbour(rt, dev);
1443 rt->rt6i_flags = cfg->fc_flags;
1450 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1451 int type = nla_type(nla);
1454 if (type > RTAX_MAX) {
1459 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1465 rt->rt6i_idev = idev;
1466 rt->rt6i_table = table;
1468 cfg->fc_nlinfo.nl_net = dev_net(dev);
1470 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * Remove 'rt' from its table.
 *
 * The namespace's ip6_null_entry is tested for before anything else.
 * fib6_del() runs under the table's write lock, and the caller's
 * reference on the route is dropped with dst_release().
 */
1482 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1485 struct fib6_table *table;
1486 struct net *net = dev_net(rt->dst.dev);
1488 if (rt == net->ipv6.ip6_null_entry) {
1493 table = rt->rt6i_table;
1494 write_lock_bh(&table->tb6_lock);
1495 err = fib6_del(rt, info);
1496 write_unlock_bh(&table->tb6_lock);
1499 dst_release(&rt->dst);
/*
 * Delete a route using an nl_info whose nl_net is the network
 * namespace of the route's device; see __ip6_del_rt().
 */
1503 int ip6_del_rt(struct rt6_info *rt)
1505 struct nl_info info = {
1506 .nl_net = dev_net(rt->dst.dev),
1508 return __ip6_del_rt(rt, &info);
/*
 * Delete the route described by 'cfg'.
 *
 * The table is looked up (never created) and the exact node for the
 * destination/source prefixes is located under the table's read lock.
 * The node's routes are then filtered by interface index, gateway
 * (only when RTF_GATEWAY is requested) and metric (only when
 * non-zero).  For the selected route the read lock is released before
 * __ip6_del_rt() is called, since that function takes the write lock
 * itself.
 */
1511 static int ip6_route_del(struct fib6_config *cfg)
1513 struct fib6_table *table;
1514 struct fib6_node *fn;
1515 struct rt6_info *rt;
1518 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1522 read_lock_bh(&table->tb6_lock);
1524 fn = fib6_locate(&table->tb6_root,
1525 &cfg->fc_dst, cfg->fc_dst_len,
1526 &cfg->fc_src, cfg->fc_src_len);
1529 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1530 if (cfg->fc_ifindex &&
1532 rt->dst.dev->ifindex != cfg->fc_ifindex))
1534 if (cfg->fc_flags & RTF_GATEWAY &&
1535 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1537 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1540 read_unlock_bh(&table->tb6_lock);
1542 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1545 read_unlock_bh(&table->tb6_lock);
/*
 * Flow key extended with the address of the router that sent a
 * redirect.  __ip6_route_redirect() casts the struct flowi6 pointer it
 * receives back to this type to read 'gateway'.
 */
1553 struct ip6rd_flowi {
1555 struct in6_addr gateway;
/*
 * Per-table lookup callback used to validate a redirect.
 *
 * Under the table's read lock, the routes of the node matching the
 * flow are scanned for one that is not expired, is a gateway route,
 * uses the interface the redirect arrived on (flowi6_oif) and whose
 * gateway equals the redirect's sender (rdfl->gateway).  On the path
 * that sets rt to the null entry, BACKTRACK is applied.
 */
1558 static struct rt6_info *__ip6_route_redirect(struct net *net,
1559 struct fib6_table *table,
1563 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1564 struct rt6_info *rt;
1565 struct fib6_node *fn;
1568 * Get the "current" route for this destination and
1569 * check if the redirect has come from approriate router.
1571 * RFC 2461 specifies that redirects should only be
1572 * accepted if they come from the nexthop to the target.
1573 * Due to the way the routes are chosen, this notion
1574 * is a bit fuzzy and one might need to check all possible
1578 read_lock_bh(&table->tb6_lock);
1579 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1581 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1583 * Current route is on-link; redirect is always invalid.
1585 * Seems, previous statement is not true. It could
1586 * be node, which looks for us as on-link (f.e. proxy ndisc)
1587 * But then router serving it might decide, that we should
1588 * know truth 8)8) --ANK (980726).
1590 if (rt6_check_expired(rt))
1592 if (!(rt->rt6i_flags & RTF_GATEWAY))
1594 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1596 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1602 rt = net->ipv6.ip6_null_entry;
1603 BACKTRACK(net, &fl6->saddr);
1607 read_unlock_bh(&table->tb6_lock);
1612 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1613 const struct in6_addr *src,
1614 const struct in6_addr *gateway,
1615 struct net_device *dev)
1617 int flags = RT6_LOOKUP_F_HAS_SADDR;
1618 struct net *net = dev_net(dev);
1619 struct ip6rd_flowi rdfl = {
1621 .flowi6_oif = dev->ifindex,
1627 rdfl.gateway = *gateway;
1629 if (rt6_need_strict(dest))
1630 flags |= RT6_LOOKUP_F_IFACE;
1632 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1633 flags, __ip6_route_redirect);
1636 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1637 const struct in6_addr *saddr,
1638 struct neighbour *neigh, u8 *lladdr, int on_link)
1640 struct rt6_info *rt, *nrt = NULL;
1641 struct netevent_redirect netevent;
1642 struct net *net = dev_net(neigh->dev);
1644 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1646 if (rt == net->ipv6.ip6_null_entry) {
1647 if (net_ratelimit())
1648 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1649 "for redirect target\n");
1654 * We have finally decided to accept it.
1657 neigh_update(neigh, lladdr, NUD_STALE,
1658 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1659 NEIGH_UPDATE_F_OVERRIDE|
1660 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1661 NEIGH_UPDATE_F_ISROUTER))
1665 * Redirect received -> path was valid.
1666 * Look, redirects are sent only in response to data packets,
1667 * so that this nexthop apparently is reachable. --ANK
1669 dst_confirm(&rt->dst);
1671 /* Duplicate redirect: silently ignore. */
1672 if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1675 nrt = ip6_rt_copy(rt, dest);
1679 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1681 nrt->rt6i_flags &= ~RTF_GATEWAY;
1683 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1684 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1686 if (ip6_ins_rt(nrt))
1689 netevent.old = &rt->dst;
1690 netevent.new = &nrt->dst;
1691 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1693 if (rt->rt6i_flags & RTF_CACHE) {
1699 dst_release(&rt->dst);
1703 * Handle ICMP "packet too big" messages
1704 * i.e. Path MTU discovery
1707 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1708 struct net *net, u32 pmtu, int ifindex)
1710 struct rt6_info *rt, *nrt;
1713 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1717 if (rt6_check_expired(rt)) {
1722 if (pmtu >= dst_mtu(&rt->dst))
1725 if (pmtu < IPV6_MIN_MTU) {
1727 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1728 * MTU (1280) and a fragment header should always be included
1729 * after a node receiving Too Big message reporting PMTU is
1730 * less than the IPv6 Minimum Link MTU.
1732 pmtu = IPV6_MIN_MTU;
1736 /* New mtu received -> path was valid.
1737 They are sent only in response to data packets,
1738 so that this nexthop apparently is reachable. --ANK
1740 dst_confirm(&rt->dst);
1742 /* Host route. If it is static, it would be better
1743 not to override it, but add new one, so that
1744 when cache entry will expire old pmtu
1745 would return automatically.
1747 if (rt->rt6i_flags & RTF_CACHE) {
1748 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1750 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1751 features |= RTAX_FEATURE_ALLFRAG;
1752 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1754 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1755 rt->rt6i_flags |= RTF_MODIFIED;
1760 Two cases are possible:
1761 1. It is connected route. Action: COW
1762 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1764 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1765 nrt = rt6_alloc_cow(rt, daddr, saddr);
1767 nrt = rt6_alloc_clone(rt, daddr);
1770 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1772 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1773 features |= RTAX_FEATURE_ALLFRAG;
1774 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1777 /* According to RFC 1981, an attempt to detect a PMTU increase
1778 * must not be made within 5 minutes of a Too Big message; the
1779 * recommended timer is 10 minutes. The route expiry is therefore
1780 * set to ip6_rt_mtu_expires (10 minutes by default): once it elapses,
1781 * the reduced PMTU expires and PMTU increases are detected again.
1783 rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1784 nrt->rt6i_flags |= RTF_DYNAMIC;
1788 dst_release(&rt->dst);
1791 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1792 struct net_device *dev, u32 pmtu)
1794 struct net *net = dev_net(dev);
1797 * RFC 1981 states that a node "MUST reduce the size of the packets it
1798 * is sending along the path" that caused the Packet Too Big message.
1799 * Since it's not possible in the general case to determine which
1800 * interface was used to send the original packet, we update the MTU
1801 * on the interface that will be used to send future packets. We also
1802 * update the MTU on the interface that received the Packet Too Big in
1803 * case the original packet was forced out that interface with
1804 * SO_BINDTODEVICE or similar. This is the next best thing to the
1805 * correct behaviour, which would be to update the MTU on all
1808 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1809 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1813 * Misc support functions
1816 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1817 const struct in6_addr *dest)
1819 struct net *net = dev_net(ort->dst.dev);
1820 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1824 rt->dst.input = ort->dst.input;
1825 rt->dst.output = ort->dst.output;
1826 rt->dst.flags |= DST_HOST;
1828 rt->rt6i_dst.addr = *dest;
1829 rt->rt6i_dst.plen = 128;
1830 dst_copy_metrics(&rt->dst, &ort->dst);
1831 rt->dst.error = ort->dst.error;
1832 rt->rt6i_idev = ort->rt6i_idev;
1834 in6_dev_hold(rt->rt6i_idev);
1835 rt->dst.lastuse = jiffies;
1837 rt->rt6i_gateway = ort->rt6i_gateway;
1838 rt->rt6i_flags = ort->rt6i_flags;
1839 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1840 (RTF_DEFAULT | RTF_ADDRCONF))
1841 rt6_set_from(rt, ort);
1843 rt6_clean_expires(rt);
1844 rt->rt6i_metric = 0;
1846 #ifdef CONFIG_IPV6_SUBTREES
1847 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1849 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1850 rt->rt6i_table = ort->rt6i_table;
1855 #ifdef CONFIG_IPV6_ROUTE_INFO
1856 static struct rt6_info *rt6_get_route_info(struct net *net,
1857 const struct in6_addr *prefix, int prefixlen,
1858 const struct in6_addr *gwaddr, int ifindex)
1860 struct fib6_node *fn;
1861 struct rt6_info *rt = NULL;
1862 struct fib6_table *table;
1864 table = fib6_get_table(net, RT6_TABLE_INFO);
1868 write_lock_bh(&table->tb6_lock);
1869 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1873 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1874 if (rt->dst.dev->ifindex != ifindex)
1876 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1878 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1884 write_unlock_bh(&table->tb6_lock);
1888 static struct rt6_info *rt6_add_route_info(struct net *net,
1889 const struct in6_addr *prefix, int prefixlen,
1890 const struct in6_addr *gwaddr, int ifindex,
1893 struct fib6_config cfg = {
1894 .fc_table = RT6_TABLE_INFO,
1895 .fc_metric = IP6_RT_PRIO_USER,
1896 .fc_ifindex = ifindex,
1897 .fc_dst_len = prefixlen,
1898 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1899 RTF_UP | RTF_PREF(pref),
1901 .fc_nlinfo.nlh = NULL,
1902 .fc_nlinfo.nl_net = net,
1905 cfg.fc_dst = *prefix;
1906 cfg.fc_gateway = *gwaddr;
1908 /* We should treat it as a default route if prefix length is 0. */
1910 cfg.fc_flags |= RTF_DEFAULT;
1912 ip6_route_add(&cfg);
1914 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1918 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1920 struct rt6_info *rt;
1921 struct fib6_table *table;
1923 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1927 write_lock_bh(&table->tb6_lock);
1928 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1929 if (dev == rt->dst.dev &&
1930 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1931 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1936 write_unlock_bh(&table->tb6_lock);
1940 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1941 struct net_device *dev,
1944 struct fib6_config cfg = {
1945 .fc_table = RT6_TABLE_DFLT,
1946 .fc_metric = IP6_RT_PRIO_USER,
1947 .fc_ifindex = dev->ifindex,
1948 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1949 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1951 .fc_nlinfo.nlh = NULL,
1952 .fc_nlinfo.nl_net = dev_net(dev),
1955 cfg.fc_gateway = *gwaddr;
1957 ip6_route_add(&cfg);
1959 return rt6_get_dflt_router(gwaddr, dev);
1962 void rt6_purge_dflt_routers(struct net *net)
1964 struct rt6_info *rt;
1965 struct fib6_table *table;
1967 /* NOTE: Keep consistent with rt6_get_dflt_router */
1968 table = fib6_get_table(net, RT6_TABLE_DFLT);
1973 read_lock_bh(&table->tb6_lock);
1974 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1975 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1977 read_unlock_bh(&table->tb6_lock);
1982 read_unlock_bh(&table->tb6_lock);
1985 static void rtmsg_to_fib6_config(struct net *net,
1986 struct in6_rtmsg *rtmsg,
1987 struct fib6_config *cfg)
1989 memset(cfg, 0, sizeof(*cfg));
1991 cfg->fc_table = RT6_TABLE_MAIN;
1992 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1993 cfg->fc_metric = rtmsg->rtmsg_metric;
1994 cfg->fc_expires = rtmsg->rtmsg_info;
1995 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1996 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1997 cfg->fc_flags = rtmsg->rtmsg_flags;
1999 cfg->fc_nlinfo.nl_net = net;
2001 cfg->fc_dst = rtmsg->rtmsg_dst;
2002 cfg->fc_src = rtmsg->rtmsg_src;
2003 cfg->fc_gateway = rtmsg->rtmsg_gateway;
2006 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2008 struct fib6_config cfg;
2009 struct in6_rtmsg rtmsg;
2013 case SIOCADDRT: /* Add a route */
2014 case SIOCDELRT: /* Delete a route */
2015 if (!capable(CAP_NET_ADMIN))
2017 err = copy_from_user(&rtmsg, arg,
2018 sizeof(struct in6_rtmsg));
2022 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2027 err = ip6_route_add(&cfg);
2030 err = ip6_route_del(&cfg);
2044 * Drop the packet on the floor
2047 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2050 struct dst_entry *dst = skb_dst(skb);
2051 switch (ipstats_mib_noroutes) {
2052 case IPSTATS_MIB_INNOROUTES:
2053 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2054 if (type == IPV6_ADDR_ANY) {
2055 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2056 IPSTATS_MIB_INADDRERRORS);
2060 case IPSTATS_MIB_OUTNOROUTES:
2061 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2062 ipstats_mib_noroutes);
2065 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2070 static int ip6_pkt_discard(struct sk_buff *skb)
2072 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2075 static int ip6_pkt_discard_out(struct sk_buff *skb)
2077 skb->dev = skb_dst(skb)->dev;
2078 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2081 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2083 static int ip6_pkt_prohibit(struct sk_buff *skb)
2085 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2088 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2090 skb->dev = skb_dst(skb)->dev;
2091 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2097 * Allocate a dst for local (unicast / anycast) address.
2100 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2101 const struct in6_addr *addr,
2104 struct net *net = dev_net(idev->dev);
2105 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2106 net->loopback_dev, 0);
2110 if (net_ratelimit())
2111 pr_warning("IPv6: Maximum number of routes reached,"
2112 " consider increasing route/max_size.\n");
2113 return ERR_PTR(-ENOMEM);
2118 rt->dst.flags |= DST_HOST;
2119 rt->dst.input = ip6_input;
2120 rt->dst.output = ip6_output;
2121 rt->rt6i_idev = idev;
2122 rt->dst.obsolete = -1;
2124 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2126 rt->rt6i_flags |= RTF_ANYCAST;
2128 rt->rt6i_flags |= RTF_LOCAL;
2129 err = rt6_bind_neighbour(rt, rt->dst.dev);
2132 return ERR_PTR(err);
2135 rt->rt6i_dst.addr = *addr;
2136 rt->rt6i_dst.plen = 128;
2137 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2139 atomic_set(&rt->dst.__refcnt, 1);
2144 int ip6_route_get_saddr(struct net *net,
2145 struct rt6_info *rt,
2146 const struct in6_addr *daddr,
2148 struct in6_addr *saddr)
2150 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2152 if (rt->rt6i_prefsrc.plen)
2153 *saddr = rt->rt6i_prefsrc.addr;
2155 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2156 daddr, prefs, saddr);
2160 /* remove deleted ip from prefsrc entries */
2161 struct arg_dev_net_ip {
2162 struct net_device *dev;
2164 struct in6_addr *addr;
2167 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2169 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2170 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2171 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2173 if (((void *)rt->dst.dev == dev || !dev) &&
2174 rt != net->ipv6.ip6_null_entry &&
2175 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2176 /* remove prefsrc entry */
2177 rt->rt6i_prefsrc.plen = 0;
2182 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2184 struct net *net = dev_net(ifp->idev->dev);
2185 struct arg_dev_net_ip adni = {
2186 .dev = ifp->idev->dev,
2190 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2193 struct arg_dev_net {
2194 struct net_device *dev;
2198 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2200 const struct arg_dev_net *adn = arg;
2201 const struct net_device *dev = adn->dev;
2203 if ((rt->dst.dev == dev || !dev) &&
2204 rt != adn->net->ipv6.ip6_null_entry)
2210 void rt6_ifdown(struct net *net, struct net_device *dev)
2212 struct arg_dev_net adn = {
2217 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2218 icmp6_clean_all(fib6_ifdown, &adn);
2221 struct rt6_mtu_change_arg
2223 struct net_device *dev;
2227 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2229 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2230 struct inet6_dev *idev;
2232 /* In IPv6 pmtu discovery is not optional,
2233 so that RTAX_MTU lock cannot disable it.
2234 We still use this lock to block changes
2235 caused by addrconf/ndisc.
2238 idev = __in6_dev_get(arg->dev);
2242 /* For administrative MTU increase, there is no way to discover
2243 IPv6 PMTU increase, so PMTU increase should be updated here.
2244 Since RFC 1981 does not cover administrative MTU increases,
2245 updating the PMTU on such an increase is a MUST (e.g. jumbo frames).
2248 If new MTU is less than route PMTU, this new MTU will be the
2249 lowest MTU in the path, update the route PMTU to reflect PMTU
2250 decreases; if new MTU is greater than route PMTU, and the
2251 old MTU is the lowest MTU in the path, update the route PMTU
2252 to reflect the increase. In this case, if other nodes on the path
2253 also have the lowest MTU, a Packet Too Big message will lead to
2256 if (rt->dst.dev == arg->dev &&
2257 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2258 (dst_mtu(&rt->dst) >= arg->mtu ||
2259 (dst_mtu(&rt->dst) < arg->mtu &&
2260 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2261 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2266 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2268 struct rt6_mtu_change_arg arg = {
2273 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2276 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2277 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2278 [RTA_OIF] = { .type = NLA_U32 },
2279 [RTA_IIF] = { .type = NLA_U32 },
2280 [RTA_PRIORITY] = { .type = NLA_U32 },
2281 [RTA_METRICS] = { .type = NLA_NESTED },
2284 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2285 struct fib6_config *cfg)
2288 struct nlattr *tb[RTA_MAX+1];
2291 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2296 rtm = nlmsg_data(nlh);
2297 memset(cfg, 0, sizeof(*cfg));
2299 cfg->fc_table = rtm->rtm_table;
2300 cfg->fc_dst_len = rtm->rtm_dst_len;
2301 cfg->fc_src_len = rtm->rtm_src_len;
2302 cfg->fc_flags = RTF_UP;
2303 cfg->fc_protocol = rtm->rtm_protocol;
2305 if (rtm->rtm_type == RTN_UNREACHABLE)
2306 cfg->fc_flags |= RTF_REJECT;
2308 if (rtm->rtm_type == RTN_LOCAL)
2309 cfg->fc_flags |= RTF_LOCAL;
2311 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2312 cfg->fc_nlinfo.nlh = nlh;
2313 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2315 if (tb[RTA_GATEWAY]) {
2316 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2317 cfg->fc_flags |= RTF_GATEWAY;
2321 int plen = (rtm->rtm_dst_len + 7) >> 3;
2323 if (nla_len(tb[RTA_DST]) < plen)
2326 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2330 int plen = (rtm->rtm_src_len + 7) >> 3;
2332 if (nla_len(tb[RTA_SRC]) < plen)
2335 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2338 if (tb[RTA_PREFSRC])
2339 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2342 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2344 if (tb[RTA_PRIORITY])
2345 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2347 if (tb[RTA_METRICS]) {
2348 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2349 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2353 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2360 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2362 struct fib6_config cfg;
2365 err = rtm_to_fib6_config(skb, nlh, &cfg);
2369 return ip6_route_del(&cfg);
2372 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2374 struct fib6_config cfg;
2377 err = rtm_to_fib6_config(skb, nlh, &cfg);
2381 return ip6_route_add(&cfg);
2384 static inline size_t rt6_nlmsg_size(void)
2386 return NLMSG_ALIGN(sizeof(struct rtmsg))
2387 + nla_total_size(16) /* RTA_SRC */
2388 + nla_total_size(16) /* RTA_DST */
2389 + nla_total_size(16) /* RTA_GATEWAY */
2390 + nla_total_size(16) /* RTA_PREFSRC */
2391 + nla_total_size(4) /* RTA_TABLE */
2392 + nla_total_size(4) /* RTA_IIF */
2393 + nla_total_size(4) /* RTA_OIF */
2394 + nla_total_size(4) /* RTA_PRIORITY */
2395 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2396 + nla_total_size(sizeof(struct rta_cacheinfo));
2399 static int rt6_fill_node(struct net *net,
2400 struct sk_buff *skb, struct rt6_info *rt,
2401 struct in6_addr *dst, struct in6_addr *src,
2402 int iif, int type, u32 pid, u32 seq,
2403 int prefix, int nowait, unsigned int flags)
2405 const struct inet_peer *peer;
2407 struct nlmsghdr *nlh;
2410 struct neighbour *n;
2413 if (prefix) { /* user wants prefix routes only */
2414 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2415 /* success since this is not a prefix route */
2420 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2424 rtm = nlmsg_data(nlh);
2425 rtm->rtm_family = AF_INET6;
2426 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2427 rtm->rtm_src_len = rt->rt6i_src.plen;
2430 table = rt->rt6i_table->tb6_id;
2432 table = RT6_TABLE_UNSPEC;
2433 rtm->rtm_table = table;
2434 NLA_PUT_U32(skb, RTA_TABLE, table);
2435 if (rt->rt6i_flags & RTF_REJECT)
2436 rtm->rtm_type = RTN_UNREACHABLE;
2437 else if (rt->rt6i_flags & RTF_LOCAL)
2438 rtm->rtm_type = RTN_LOCAL;
2439 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2440 rtm->rtm_type = RTN_LOCAL;
2442 rtm->rtm_type = RTN_UNICAST;
2444 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2445 rtm->rtm_protocol = rt->rt6i_protocol;
2446 if (rt->rt6i_flags & RTF_DYNAMIC)
2447 rtm->rtm_protocol = RTPROT_REDIRECT;
2448 else if (rt->rt6i_flags & RTF_ADDRCONF)
2449 rtm->rtm_protocol = RTPROT_KERNEL;
2450 else if (rt->rt6i_flags & RTF_DEFAULT)
2451 rtm->rtm_protocol = RTPROT_RA;
2453 if (rt->rt6i_flags & RTF_CACHE)
2454 rtm->rtm_flags |= RTM_F_CLONED;
2457 NLA_PUT(skb, RTA_DST, 16, dst);
2458 rtm->rtm_dst_len = 128;
2459 } else if (rtm->rtm_dst_len)
2460 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2461 #ifdef CONFIG_IPV6_SUBTREES
2463 NLA_PUT(skb, RTA_SRC, 16, src);
2464 rtm->rtm_src_len = 128;
2465 } else if (rtm->rtm_src_len)
2466 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2469 #ifdef CONFIG_IPV6_MROUTE
2470 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2471 int err = ip6mr_get_route(net, skb, rtm, nowait);
2476 goto nla_put_failure;
2478 if (err == -EMSGSIZE)
2479 goto nla_put_failure;
2484 NLA_PUT_U32(skb, RTA_IIF, iif);
2486 struct in6_addr saddr_buf;
2487 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2488 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2491 if (rt->rt6i_prefsrc.plen) {
2492 struct in6_addr saddr_buf;
2493 saddr_buf = rt->rt6i_prefsrc.addr;
2494 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2497 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2498 goto nla_put_failure;
2501 n = dst_get_neighbour_noref(&rt->dst);
2503 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2505 goto nla_put_failure;
2511 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2513 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2515 if (!(rt->rt6i_flags & RTF_EXPIRES))
2517 else if (rt->dst.expires - jiffies < INT_MAX)
2518 expires = rt->dst.expires - jiffies;
2522 peer = rt->rt6i_peer;
2524 if (peer && peer->tcp_ts_stamp) {
2526 tsage = get_seconds() - peer->tcp_ts_stamp;
2529 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2530 expires, rt->dst.error) < 0)
2531 goto nla_put_failure;
2533 return nlmsg_end(skb, nlh);
2536 nlmsg_cancel(skb, nlh);
2540 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2542 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2545 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2546 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2547 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2551 return rt6_fill_node(arg->net,
2552 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2553 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2554 prefix, 0, NLM_F_MULTI);
2557 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2559 struct net *net = sock_net(in_skb->sk);
2560 struct nlattr *tb[RTA_MAX+1];
2561 struct rt6_info *rt;
2562 struct sk_buff *skb;
2565 int err, iif = 0, oif = 0;
2567 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2572 memset(&fl6, 0, sizeof(fl6));
2575 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2578 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2582 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2585 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2589 iif = nla_get_u32(tb[RTA_IIF]);
2592 oif = nla_get_u32(tb[RTA_OIF]);
2595 struct net_device *dev;
2598 dev = __dev_get_by_index(net, iif);
2604 fl6.flowi6_iif = iif;
2606 if (!ipv6_addr_any(&fl6.saddr))
2607 flags |= RT6_LOOKUP_F_HAS_SADDR;
2609 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2612 fl6.flowi6_oif = oif;
2614 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2617 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2623 /* Reserve room for dummy headers: this skb can pass
2624 through a good chunk of the routing engine.
2626 skb_reset_mac_header(skb);
2627 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2629 skb_dst_set(skb, &rt->dst);
2631 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2632 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2633 nlh->nlmsg_seq, 0, 0, 0);
2639 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2644 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2646 struct sk_buff *skb;
2647 struct net *net = info->nl_net;
2652 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2654 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2658 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2659 event, info->pid, seq, 0, 0, 0);
2661 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2662 WARN_ON(err == -EMSGSIZE);
2666 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2667 info->nlh, gfp_any());
2671 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2674 static int ip6_route_dev_notify(struct notifier_block *this,
2675 unsigned long event, void *data)
2677 struct net_device *dev = (struct net_device *)data;
2678 struct net *net = dev_net(dev);
2680 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2681 net->ipv6.ip6_null_entry->dst.dev = dev;
2682 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2683 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2684 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2685 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2686 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2687 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2698 #ifdef CONFIG_PROC_FS
2709 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2711 struct seq_file *m = p_arg;
2712 struct neighbour *n;
2714 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2716 #ifdef CONFIG_IPV6_SUBTREES
2717 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2719 seq_puts(m, "00000000000000000000000000000000 00 ");
2722 n = dst_get_neighbour_noref(&rt->dst);
2724 seq_printf(m, "%pi6", n->primary_key);
2726 seq_puts(m, "00000000000000000000000000000000");
2729 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2730 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2731 rt->dst.__use, rt->rt6i_flags,
2732 rt->dst.dev ? rt->dst.dev->name : "");
2736 static int ipv6_route_show(struct seq_file *m, void *v)
2738 struct net *net = (struct net *)m->private;
2739 fib6_clean_all_ro(net, rt6_info_route, 0, m);
2743 static int ipv6_route_open(struct inode *inode, struct file *file)
2745 return single_open_net(inode, file, ipv6_route_show);
2748 static const struct file_operations ipv6_route_proc_fops = {
2749 .owner = THIS_MODULE,
2750 .open = ipv6_route_open,
2752 .llseek = seq_lseek,
2753 .release = single_release_net,
2756 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2758 struct net *net = (struct net *)seq->private;
2759 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2760 net->ipv6.rt6_stats->fib_nodes,
2761 net->ipv6.rt6_stats->fib_route_nodes,
2762 net->ipv6.rt6_stats->fib_rt_alloc,
2763 net->ipv6.rt6_stats->fib_rt_entries,
2764 net->ipv6.rt6_stats->fib_rt_cache,
2765 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2766 net->ipv6.rt6_stats->fib_discarded_routes);
2771 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2773 return single_open_net(inode, file, rt6_stats_seq_show);
2776 static const struct file_operations rt6_stats_seq_fops = {
2777 .owner = THIS_MODULE,
2778 .open = rt6_stats_seq_open,
2780 .llseek = seq_lseek,
2781 .release = single_release_net,
2783 #endif /* CONFIG_PROC_FS */
2785 #ifdef CONFIG_SYSCTL
2788 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2789 void __user *buffer, size_t *lenp, loff_t *ppos)
2796 net = (struct net *)ctl->extra1;
2797 delay = net->ipv6.sysctl.flush_delay;
2798 proc_dointvec(ctl, write, buffer, lenp, ppos);
2799 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2803 ctl_table ipv6_route_table_template[] = {
2805 .procname = "flush",
2806 .data = &init_net.ipv6.sysctl.flush_delay,
2807 .maxlen = sizeof(int),
2809 .proc_handler = ipv6_sysctl_rtcache_flush
2812 .procname = "gc_thresh",
2813 .data = &ip6_dst_ops_template.gc_thresh,
2814 .maxlen = sizeof(int),
2816 .proc_handler = proc_dointvec,
2819 .procname = "max_size",
2820 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2821 .maxlen = sizeof(int),
2823 .proc_handler = proc_dointvec,
2826 .procname = "gc_min_interval",
2827 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2828 .maxlen = sizeof(int),
2830 .proc_handler = proc_dointvec_jiffies,
2833 .procname = "gc_timeout",
2834 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2835 .maxlen = sizeof(int),
2837 .proc_handler = proc_dointvec_jiffies,
2840 .procname = "gc_interval",
2841 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2842 .maxlen = sizeof(int),
2844 .proc_handler = proc_dointvec_jiffies,
2847 .procname = "gc_elasticity",
2848 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2849 .maxlen = sizeof(int),
2851 .proc_handler = proc_dointvec,
2854 .procname = "mtu_expires",
2855 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2856 .maxlen = sizeof(int),
2858 .proc_handler = proc_dointvec_jiffies,
2861 .procname = "min_adv_mss",
2862 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2863 .maxlen = sizeof(int),
2865 .proc_handler = proc_dointvec,
2868 .procname = "gc_min_interval_ms",
2869 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2870 .maxlen = sizeof(int),
2872 .proc_handler = proc_dointvec_ms_jiffies,
2877 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2879 struct ctl_table *table;
2881 table = kmemdup(ipv6_route_table_template,
2882 sizeof(ipv6_route_table_template),
2886 table[0].data = &net->ipv6.sysctl.flush_delay;
2887 table[0].extra1 = net;
2888 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2889 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2890 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2891 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2892 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2893 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2894 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2895 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2896 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * ip6_route_net_init - set up the IPv6 routing state of one network namespace.
 * @net: namespace being initialised.
 *
 * Copies ip6_dst_ops_template into net->ipv6.ip6_dst_ops, initialises the
 * dst entry accounting for that copy, duplicates the special route
 * template(s) with kmemdup() and finally fills in the per-namespace sysctl
 * defaults.  On failure the resources acquired so far are released through
 * the goto labels at the bottom, in reverse order of acquisition.
 *
 * NOTE(review): this listing has gaps (see the embedded line numbers).  The
 * final kmemdup() argument, the return statements and the labels
 * out_ip6_null_entry / out_ip6_dst_ops are not visible here, so the exact
 * return values cannot be confirmed from this excerpt.
 */
2903 static int __net_init ip6_route_net_init(struct net *net)
/* Each namespace gets its own private copy of the dst_ops template. */
2907 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2908 sizeof(net->ipv6.ip6_dst_ops));
2910 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2911 goto out_ip6_dst_ops;
/*
 * Null entry: clone the template, make dst.path point back at the entry
 * itself, bind it to this namespace's dst_ops and seed its metrics from
 * ip6_template_metrics.
 */
2913 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2914 sizeof(*net->ipv6.ip6_null_entry),
2916 if (!net->ipv6.ip6_null_entry)
2917 goto out_ip6_dst_entries;
2918 net->ipv6.ip6_null_entry->dst.path =
2919 (struct dst_entry *)net->ipv6.ip6_null_entry;
2920 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2921 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2922 ip6_template_metrics, true);
/*
 * With multiple routing tables the prohibit and blackhole entries are
 * cloned and wired up in exactly the same way as the null entry.
 */
2924 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2925 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2926 sizeof(*net->ipv6.ip6_prohibit_entry),
2928 if (!net->ipv6.ip6_prohibit_entry)
2929 goto out_ip6_null_entry;
2930 net->ipv6.ip6_prohibit_entry->dst.path =
2931 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2932 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2933 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2934 ip6_template_metrics, true);
2936 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2937 sizeof(*net->ipv6.ip6_blk_hole_entry),
2939 if (!net->ipv6.ip6_blk_hole_entry)
2940 goto out_ip6_prohibit_entry;
2941 net->ipv6.ip6_blk_hole_entry->dst.path =
2942 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2943 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2944 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2945 ip6_template_metrics, true);
/*
 * Per-namespace sysctl defaults.  The time-based values are written as
 * multiples of HZ.
 */
2948 net->ipv6.sysctl.flush_delay = 0;
2949 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2950 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2951 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2952 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2953 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2954 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* NOTE(review): 20 and 40 are presumably the TCP and IPv6 header sizes - confirm. */
2955 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2957 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/*
 * Error unwinding: each label releases what was set up before the failing
 * step.  The labels fall through, so later failures also run the earlier
 * cleanups.
 */
2963 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2964 out_ip6_prohibit_entry:
2965 kfree(net->ipv6.ip6_prohibit_entry);
2967 kfree(net->ipv6.ip6_null_entry);
2969 out_ip6_dst_entries:
2970 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_exit - release the per-namespace IPv6 routing state.
 * @net: namespace being torn down.
 *
 * Frees the special route entries that ip6_route_net_init() allocated with
 * kmemdup() (prohibit and blackhole only when CONFIG_IPV6_MULTIPLE_TABLES is
 * set) and then destroys the dst entry accounting of the namespace's
 * dst_ops.
 */
2975 static void __net_exit ip6_route_net_exit(struct net *net)
2977 kfree(net->ipv6.ip6_null_entry);
2978 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2979 kfree(net->ipv6.ip6_prohibit_entry);
2980 kfree(net->ipv6.ip6_blk_hole_entry);
2982 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_init_late - late per-namespace setup.
 * @net: namespace being initialised.
 *
 * When CONFIG_PROC_FS is enabled, creates the "ipv6_route" (mode 0) and
 * "rt6_stats" (mode S_IRUGO) proc entries for the namespace.  The results of
 * proc_net_fops_create() are not checked in the lines shown.
 *
 * NOTE(review): the return statement is not visible in this listing.
 */
2985 static int __net_init ip6_route_net_init_late(struct net *net)
2987 #ifdef CONFIG_PROC_FS
2988 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2989 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/*
 * ip6_route_net_exit_late - undo ip6_route_net_init_late().
 * @net: namespace being torn down.
 *
 * When CONFIG_PROC_FS is enabled, removes the "ipv6_route" and "rt6_stats"
 * proc entries of the namespace.
 */
2994 static void __net_exit ip6_route_net_exit_late(struct net *net)
2996 #ifdef CONFIG_PROC_FS
2997 proc_net_remove(net, "ipv6_route");
2998 proc_net_remove(net, "rt6_stats");
/*
 * Per-namespace hooks for the core routing state (dst_ops copy, special
 * route entries, sysctl defaults).  Registered from ip6_route_init() with
 * register_pernet_subsys().
 */
3002 static struct pernet_operations ip6_route_net_ops = {
3003 .init = ip6_route_net_init,
3004 .exit = ip6_route_net_exit,
/*
 * Per-namespace hooks for the late setup step (proc entries).  Registered
 * from ip6_route_init() after fib6_rules_init() has been called.
 */
3007 static struct pernet_operations ip6_route_net_late_ops = {
3008 .init = ip6_route_net_init_late,
3009 .exit = ip6_route_net_exit_late,
/*
 * Netdevice notifier that dispatches device events to
 * ip6_route_dev_notify().  Registered as the last step of ip6_route_init()
 * and unregistered first in ip6_route_cleanup().
 */
3012 static struct notifier_block ip6_route_dev_notifier = {
3013 .notifier_call = ip6_route_dev_notify,
/*
 * ip6_route_init - one-time initialisation of the IPv6 routing subsystem.
 *
 * Visible steps, in order:
 *   1. create the "ip6_dst_cache" slab cache, sized for struct rt6_info;
 *   2. initialise dst entry accounting for ip6_dst_blackhole_ops;
 *   3. register the per-namespace ops (ip6_route_net_ops);
 *   4. let the blackhole ops share the same slab cache;
 *   5. attach init_net's special route entries to the loopback device;
 *   6. call fib6_rules_init() and register the late per-namespace ops;
 *   7. register the rtnetlink NEWROUTE/DELROUTE/GETROUTE handlers;
 *   8. register the netdevice notifier.
 *
 * A failing step jumps into the unwind chain at the bottom, which undoes the
 * preceding steps in reverse order.
 *
 * NOTE(review): this listing has gaps (see the embedded line numbers).  The
 * error checks after most calls, the labels out_kmem_cache, out_dst_entries
 * and fib6_rules_init, the return statements and whatever call precedes
 * "goto out_register_subsys" are not visible here; further init and cleanup
 * steps may sit in those gaps.
 */
3017 int __init ip6_route_init(void)
3022 ip6_dst_ops_template.kmem_cachep =
3023 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3024 SLAB_HWCACHE_ALIGN, NULL);
3025 if (!ip6_dst_ops_template.kmem_cachep)
3028 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3030 goto out_kmem_cache;
3032 ret = register_pernet_subsys(&ip6_route_net_ops);
3034 goto out_dst_entries;
/* Blackhole dst entries are allocated from the same slab cache. */
3036 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3038 /* Registering of the loopback is done before this portion of code,
3039 * the loopback reference in rt6_info will not be taken, do it
3040 * manually for init_net */
3041 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3042 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3043 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3044 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3045 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3046 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3047 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3051 goto out_register_subsys;
3057 ret = fib6_rules_init();
3061 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3063 goto fib6_rules_init;
/* Any one of the three rtnetlink registrations failing aborts the init. */
3066 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3067 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3068 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3069 goto out_register_late_subsys;
3071 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3073 goto out_register_late_subsys;
/*
 * Unwind chain: labels fall through, so entering at a later-stage label also
 * runs every cleanup below it.
 */
3078 out_register_late_subsys:
3079 unregister_pernet_subsys(&ip6_route_net_late_ops);
3081 fib6_rules_cleanup();
3086 out_register_subsys:
3087 unregister_pernet_subsys(&ip6_route_net_ops);
3089 dst_entries_destroy(&ip6_dst_blackhole_ops);
3091 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * ip6_route_cleanup - tear down what ip6_route_init() set up.
 *
 * Runs the visible init steps in reverse: unregister the netdevice notifier
 * and the late per-namespace ops, clean up the fib6 rules, unregister the
 * main per-namespace ops, destroy the blackhole dst accounting and finally
 * destroy the shared slab cache.
 *
 * NOTE(review): this listing has a gap between fib6_rules_cleanup() and the
 * second unregister_pernet_subsys() call (see the embedded line numbers);
 * additional cleanup calls may sit there.
 */
3095 void ip6_route_cleanup(void)
3097 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3098 unregister_pernet_subsys(&ip6_route_net_late_ops);
3099 fib6_rules_cleanup();
3102 unregister_pernet_subsys(&ip6_route_net_ops);
3103 dst_entries_destroy(&ip6_dst_blackhole_ops);
3104 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);