2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
57 #include <net/dst_metadata.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
65 #include <asm/uaccess.h>
68 #include <linux/sysctl.h>
72 RT6_NUD_FAIL_HARD = -3,
73 RT6_NUD_FAIL_PROBE = -2,
74 RT6_NUD_FAIL_DO_RR = -1,
78 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
79 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
80 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
81 static unsigned int ip6_mtu(const struct dst_entry *dst);
82 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
83 static void ip6_dst_destroy(struct dst_entry *);
84 static void ip6_dst_ifdown(struct dst_entry *,
85 struct net_device *dev, int how);
86 static int ip6_dst_gc(struct dst_ops *ops);
88 static int ip6_pkt_discard(struct sk_buff *skb);
89 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
90 static int ip6_pkt_prohibit(struct sk_buff *skb);
91 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
92 static void ip6_link_failure(struct sk_buff *skb);
93 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
94 struct sk_buff *skb, u32 mtu);
95 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
97 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
98 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct net *net,
102 const struct in6_addr *prefix, int prefixlen,
103 const struct in6_addr *gwaddr, int ifindex,
105 static struct rt6_info *rt6_get_route_info(struct net *net,
106 const struct in6_addr *prefix, int prefixlen,
107 const struct in6_addr *gwaddr, int ifindex);
/* Per-CPU list of uncached (non-fib-tree) rt6_info entries; used by
 * rt6_uncached_list_flush_dev() to retarget them when a device goes away.
 * NOTE(review): the spinlock member guarding 'head' is elided from this view.
 */
110 struct uncached_list {
112 struct list_head head;
115 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Mark @rt as DST_NOCACHE and append it to this CPU's uncached list under
 * the list's BH-safe spinlock, remembering the list for later removal.
 */
117 static void rt6_uncached_list_add(struct rt6_info *rt)
119 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
121 rt->dst.flags |= DST_NOCACHE;
122 rt->rt6i_uncached_list = ul;
124 spin_lock_bh(&ul->lock);
125 list_add_tail(&rt->rt6i_uncached, &ul->head);
126 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the uncached list it was added to, if any.
 * The list_empty() check makes this safe for routes never added.
 */
129 static void rt6_uncached_list_del(struct rt6_info *rt)
131 if (!list_empty(&rt->rt6i_uncached)) {
132 struct uncached_list *ul = rt->rt6i_uncached_list;
134 spin_lock_bh(&ul->lock);
135 list_del(&rt->rt6i_uncached);
136 spin_unlock_bh(&ul->lock);
/* Device teardown: walk every CPU's uncached list and retarget any route
 * still referencing @dev onto the namespace loopback device, transferring
 * the idev and device references.  No-op when @dev is loopback itself.
 */
140 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
142 struct net_device *loopback_dev = net->loopback_dev;
145 if (dev == loopback_dev)
148 for_each_possible_cpu(cpu) {
149 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
152 spin_lock_bh(&ul->lock);
153 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
154 struct inet6_dev *rt_idev = rt->rt6i_idev;
155 struct net_device *rt_dev = rt->dst.dev;
157 if (rt_idev->dev == dev) {
158 rt->rt6i_idev = in6_dev_get(loopback_dev);
159 in6_dev_put(rt_idev);
163 rt->dst.dev = loopback_dev;
164 dev_hold(rt->dst.dev);
168 spin_unlock_bh(&ul->lock);
/* Per-cpu clones share metrics with their parent: write through to the
 * metrics of the route this clone was created from (dst.from).
 */
172 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
174 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics: pick a writable metrics area depending on the
 * route type — RTF_PCPU clones redirect to the parent's metrics, and
 * everything else falls back to the generic copy-on-write path.
 * (The RTF_CACHE branch body is elided from this view.)
 */
177 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
179 struct rt6_info *rt = (struct rt6_info *)dst;
181 if (rt->rt6i_flags & RTF_PCPU)
182 return rt6_pcpu_cow_metrics(rt);
183 else if (rt->rt6i_flags & RTF_CACHE)
186 return dst_cow_metrics_generic(dst, old);
/* Select the address used for the neighbour lookup: the route's gateway
 * when one is set, otherwise the packet's destination address.
 */
189 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
193 struct in6_addr *p = &rt->rt6i_gateway;
195 if (!ipv6_addr_any(p))
196 return (const void *) p;
198 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup: find (or create) the ND neighbour entry for the
 * next hop chosen by choose_neigh_daddr() on the route's output device.
 */
202 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
206 struct rt6_info *rt = (struct rt6_info *) dst;
209 daddr = choose_neigh_daddr(rt, skb, daddr);
210 n = __ipv6_neigh_lookup(dst->dev, daddr);
213 return neigh_create(&nd_tbl, daddr, dst->dev);
/* Template for the per-netns IPv6 dst_ops; copied into each net namespace
 * and wired to the ip6_* callbacks defined in this file.
 */
216 static struct dst_ops ip6_dst_ops_template = {
220 .check = ip6_dst_check,
221 .default_advmss = ip6_default_advmss,
223 .cow_metrics = ipv6_cow_metrics,
224 .destroy = ip6_dst_destroy,
225 .ifdown = ip6_dst_ifdown,
226 .negative_advice = ip6_negative_advice,
227 .link_failure = ip6_link_failure,
228 .update_pmtu = ip6_rt_update_pmtu,
229 .redirect = rt6_do_redirect,
230 .local_out = __ip6_local_out,
231 .neigh_lookup = ip6_neigh_lookup,
/* MTU for blackhole routes: the raw RTAX_MTU metric if set, otherwise the
 * underlying device MTU.
 */
234 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
236 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
238 return mtu ? : dst->dev->mtu;
/* Blackhole routes deliberately ignore PMTU updates and redirects:
 * both callbacks are intentionally empty stubs.
 */
241 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
242 struct sk_buff *skb, u32 mtu)
246 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops for blackhole routes (see ip6_blackhole_route()): reuses the
 * normal check/destroy paths but swallows PMTU updates and redirects.
 */
251 static struct dst_ops ip6_dst_blackhole_ops = {
253 .destroy = ip6_dst_destroy,
254 .check = ip6_dst_check,
255 .mtu = ip6_blackhole_mtu,
256 .default_advmss = ip6_default_advmss,
257 .update_pmtu = ip6_rt_blackhole_update_pmtu,
258 .redirect = ip6_rt_blackhole_redirect,
259 .cow_metrics = dst_cow_metrics_generic,
260 .neigh_lookup = ip6_neigh_lookup,
/* Default metrics for the template routes below; hop limit 0 means
 * "use the per-device/ns default".
 */
263 static const u32 ip6_template_metrics[RTAX_MAX] = {
264 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the per-netns null route: rejects traffic with
 * -ENETUNREACH via ip6_pkt_discard{,_out}.  Lowest possible priority
 * (metric ~0) so any real route wins.
 */
267 static const struct rt6_info ip6_null_entry_template = {
269 .__refcnt = ATOMIC_INIT(1),
271 .obsolete = DST_OBSOLETE_FORCE_CHK,
272 .error = -ENETUNREACH,
273 .input = ip6_pkt_discard,
274 .output = ip6_pkt_discard_out,
276 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
277 .rt6i_protocol = RTPROT_KERNEL,
278 .rt6i_metric = ~(u32) 0,
279 .rt6i_ref = ATOMIC_INIT(1),
282 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Policy-routing templates: "prohibit" answers with ip6_pkt_prohibit
 * (administratively prohibited), "blackhole" silently discards via
 * dst_discard.  Both mirror the null-entry template otherwise.
 */
284 static const struct rt6_info ip6_prohibit_entry_template = {
286 .__refcnt = ATOMIC_INIT(1),
288 .obsolete = DST_OBSOLETE_FORCE_CHK,
290 .input = ip6_pkt_prohibit,
291 .output = ip6_pkt_prohibit_out,
293 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
294 .rt6i_protocol = RTPROT_KERNEL,
295 .rt6i_metric = ~(u32) 0,
296 .rt6i_ref = ATOMIC_INIT(1),
299 static const struct rt6_info ip6_blk_hole_entry_template = {
301 .__refcnt = ATOMIC_INIT(1),
303 .obsolete = DST_OBSOLETE_FORCE_CHK,
305 .input = dst_discard,
306 .output = dst_discard_sk,
308 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
309 .rt6i_protocol = RTPROT_KERNEL,
310 .rt6i_metric = ~(u32) 0,
311 .rt6i_ref = ATOMIC_INIT(1),
/* Zero everything in the rt6_info that follows the embedded dst_entry
 * (dst_alloc initialised the dst part), then set up the list heads.
 */
316 static void rt6_info_init(struct rt6_info *rt)
318 struct dst_entry *dst = &rt->dst;
320 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
321 INIT_LIST_HEAD(&rt->rt6i_siblings);
322 INIT_LIST_HEAD(&rt->rt6i_uncached);
325 /* allocate dst with ip6_dst_ops */
/* Low-level allocator: dst_alloc() with the per-netns IPv6 dst_ops and
 * DST_OBSOLETE_FORCE_CHK so every use goes through ip6_dst_check().
 */
326 static struct rt6_info *__ip6_dst_alloc(struct net *net,
327 struct net_device *dev,
330 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
331 0, DST_OBSOLETE_FORCE_CHK, flags);
/* Allocate an rt6_info plus its per-cpu clone pointer array.  On percpu
 * allocation failure the route is destroyed (elided error path visible
 * at dst_destroy below).
 */
339 static struct rt6_info *ip6_dst_alloc(struct net *net,
340 struct net_device *dev,
343 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
346 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
350 for_each_possible_cpu(cpu) {
353 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
354 /* no one shares rt */
358 dst_destroy((struct dst_entry *)rt);
/* dst_ops->destroy: release metrics, the per-cpu clone array, the
 * uncached-list membership and the idev reference.  (Release of the
 * 'from' reference is elided from this view.)
 */
366 static void ip6_dst_destroy(struct dst_entry *dst)
368 struct rt6_info *rt = (struct rt6_info *)dst;
369 struct dst_entry *from = dst->from;
370 struct inet6_dev *idev;
372 dst_destroy_metrics_generic(dst);
373 free_percpu(rt->rt6i_pcpu);
374 rt6_uncached_list_del(rt);
376 idev = rt->rt6i_idev;
378 rt->rt6i_idev = NULL;
/* dst_ops->ifdown: when a non-loopback device goes down, repoint the
 * route's idev at the namespace loopback device so the dst stays usable
 * until it is garbage-collected.
 */
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
389 struct rt6_info *rt = (struct rt6_info *)dst;
390 struct inet6_dev *idev = rt->rt6i_idev;
391 struct net_device *loopback_dev =
392 dev_net(dev)->loopback_dev;
394 if (dev != loopback_dev) {
395 if (idev && idev->dev == dev) {
396 struct inet6_dev *loopback_idev =
397 in6_dev_get(loopback_dev);
399 rt->rt6i_idev = loopback_idev;
/* A route is expired when RTF_EXPIRES is set and its deadline has passed,
 * or — for clones — when the route it was copied from (dst.from) has
 * itself expired (checked recursively).
 */
406 static bool rt6_check_expired(const struct rt6_info *rt)
408 if (rt->rt6i_flags & RTF_EXPIRES) {
409 if (time_after(jiffies, rt->dst.expires))
411 } else if (rt->dst.from) {
412 return rt6_check_expired((struct rt6_info *) rt->dst.from);
417 /* Multipath route selection:
418 * Hash based function using packet header and flowlabel.
419 * Adapted from fib_info_hashfn()
421 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
422 const struct flowi6 *fl6)
424 unsigned int val = fl6->flowi6_proto;
426 val ^= ipv6_addr_hash(&fl6->daddr);
427 val ^= ipv6_addr_hash(&fl6->saddr);
/* Fold in L4 ports (TCP/UDP/...) or ICMPv6 type/code — only meaningful
 * when the transport header is not encapsulated.  (Switch-case labels
 * are elided from this view.)
 */
429 /* Work only if this not encapsulated */
430 switch (fl6->flowi6_proto) {
434 val ^= (__force u16)fl6->fl6_sport;
435 val ^= (__force u16)fl6->fl6_dport;
439 val ^= (__force u16)fl6->fl6_icmp_type;
440 val ^= (__force u16)fl6->fl6_icmp_code;
/* RFC 6438 recommends using the flow label for ECMP hashing */
443 /* RFC6438 recommands to use flowlabel */
444 val ^= (__force u32)fl6->flowlabel;
/* Final avalanche step, then reduce into [0, candidate_count) */
446 /* Perhaps, we need to tune, this function? */
447 val = val ^ (val >> 7) ^ (val >> 12);
448 return val % candidate_count;
/* ECMP selection: hash the flow over match + its siblings and walk the
 * sibling list to the chosen index, skipping siblings whose score is
 * negative.  Index 0 keeps @match itself (the sibling list does not
 * include the route itself).
 */
451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452 struct flowi6 *fl6, int oif,
455 struct rt6_info *sibling, *next_sibling;
458 route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
459 /* Don't change the route, if route_choosen == 0
460 * (siblings does not include ourself)
463 list_for_each_entry_safe(sibling, next_sibling,
464 &match->rt6i_siblings, rt6i_siblings) {
466 if (route_choosen == 0) {
467 if (rt6_score_route(sibling, oif, strict) < 0)
477 * Route lookup. Any table->tb6_lock is implied.
/* Walk the leaf chain looking for a route bound to @oif (or matching
 * @saddr when no oif).  Loopback devices match through their idev's
 * ifindex; a local candidate is remembered as fallback.  Returns the
 * per-netns null entry when RT6_LOOKUP_F_IFACE is strict and nothing
 * matched.  (Several statements are elided from this view.)
 */
480 static inline struct rt6_info *rt6_device_match(struct net *net,
482 const struct in6_addr *saddr,
486 struct rt6_info *local = NULL;
487 struct rt6_info *sprt;
489 if (!oif && ipv6_addr_any(saddr))
492 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
493 struct net_device *dev = sprt->dst.dev;
496 if (dev->ifindex == oif)
498 if (dev->flags & IFF_LOOPBACK) {
499 if (!sprt->rt6i_idev ||
500 sprt->rt6i_idev->dev->ifindex != oif) {
501 if (flags & RT6_LOOKUP_F_IFACE && oif)
503 if (local && (!oif ||
504 local->rt6i_idev->dev->ifindex == oif))
510 if (ipv6_chk_addr(net, saddr, dev,
511 flags & RT6_LOOKUP_F_IFACE))
520 if (flags & RT6_LOOKUP_F_IFACE)
521 return net->ipv6.ip6_null_entry;
527 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for a router reachability probe: target address
 * and the device (held) to send the NS on.
 */
528 struct __rt6_probe_work {
529 struct work_struct work;
530 struct in6_addr target;
531 struct net_device *dev;
/* Workqueue callback: send a neighbour solicitation to the probe target's
 * solicited-node multicast address.  (dev_put/kfree of the work item is
 * elided from this view.)
 */
534 static void rt6_probe_deferred(struct work_struct *w)
536 struct in6_addr mcaddr;
537 struct __rt6_probe_work *work =
538 container_of(w, struct __rt6_probe_work, work);
540 addrconf_addr_solict_mult(&work->target, &mcaddr);
541 ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL, NULL);
/* Router Reachability Probing (RFC 4191 style): if the gateway's
 * neighbour entry is not VALID and the per-idev probe interval has
 * elapsed, schedule a deferred NS probe.  Rate-limited via
 * __neigh_set_probe_once().
 */
546 static void rt6_probe(struct rt6_info *rt)
548 struct __rt6_probe_work *work;
549 struct neighbour *neigh;
551 * Okay, this does not seem to be appropriate
552 * for now, however, we need to check if it
553 * is really so; aka Router Reachability Probing.
555 * Router Reachability Probe MUST be rate-limited
556 * to no more than one per minute.
558 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
561 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
563 if (neigh->nud_state & NUD_VALID)
567 write_lock(&neigh->lock);
568 if (!(neigh->nud_state & NUD_VALID) &&
571 rt->rt6i_idev->cnf.rtr_probe_interval)) {
572 work = kmalloc(sizeof(*work), GFP_ATOMIC);
574 __neigh_set_probe_once(neigh);
576 write_unlock(&neigh->lock);
/* No neighbour entry yet: probe unconditionally */
578 work = kmalloc(sizeof(*work), GFP_ATOMIC);
582 INIT_WORK(&work->work, rt6_probe_deferred);
583 work->target = rt->rt6i_gateway;
584 dev_hold(rt->dst.dev);
585 work->dev = rt->dst.dev;
586 schedule_work(&work->work);
590 rcu_read_unlock_bh();
593 static inline void rt6_probe(struct rt6_info *rt)
599 * Default Router Selection (RFC 2461 6.3.6)
/* Score the device match: non-zero when @oif is unset or matches the
 * route's device (loopback matches via its idev ifindex).  Exact return
 * values are elided from this view.
 */
601 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
603 struct net_device *dev = rt->dst.dev;
604 if (!oif || dev->ifindex == oif)
606 if ((dev->flags & IFF_LOOPBACK) &&
607 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Classify the route's next-hop neighbour state for router selection:
 * SUCCEED for non-gateway routes or VALID neighbours; with router-pref
 * enabled, a not-yet-FAILED neighbour also succeeds, otherwise
 * FAIL_PROBE triggers probing.  No neighbour entry at all means SUCCEED
 * (router-pref) or FAIL_DO_RR (round-robin).
 */
612 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
614 struct neighbour *neigh;
615 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
617 if (rt->rt6i_flags & RTF_NONEXTHOP ||
618 !(rt->rt6i_flags & RTF_GATEWAY))
619 return RT6_NUD_SUCCEED;
622 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
624 read_lock(&neigh->lock);
625 if (neigh->nud_state & NUD_VALID)
626 ret = RT6_NUD_SUCCEED;
627 #ifdef CONFIG_IPV6_ROUTER_PREF
628 else if (!(neigh->nud_state & NUD_FAILED))
629 ret = RT6_NUD_SUCCEED;
631 ret = RT6_NUD_FAIL_PROBE;
633 read_unlock(&neigh->lock);
635 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
636 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
638 rcu_read_unlock_bh();
/* Combined route score: device match (bit 0-1), decoded router preference
 * (bits 2+ when CONFIG_IPV6_ROUTER_PREF), and — under
 * RT6_LOOKUP_F_REACHABLE — the neighbour reachability verdict.  Negative
 * rt6_nud_state values propagate as failure codes.
 */
643 static int rt6_score_route(struct rt6_info *rt, int oif,
648 m = rt6_check_dev(rt, oif);
649 if (!m && (strict & RT6_LOOKUP_F_IFACE))
650 return RT6_NUD_FAIL_HARD;
651 #ifdef CONFIG_IPV6_ROUTER_PREF
652 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
654 if (strict & RT6_LOOKUP_F_REACHABLE) {
655 int n = rt6_check_neigh(rt);
/* Compare @rt against the current best (@match / *mpri) and return the
 * better of the two.  Skips expired routes and (when configured) routes
 * whose carrier-less device should be ignored.  FAIL_DO_RR scores as the
 * lowest valid score but flags *do_rr for round-robin rotation.
 */
662 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
663 int *mpri, struct rt6_info *match,
667 bool match_do_rr = false;
668 struct inet6_dev *idev = rt->rt6i_idev;
669 struct net_device *dev = rt->dst.dev;
671 if (dev && !netif_carrier_ok(dev) &&
672 idev->cnf.ignore_routes_with_linkdown)
675 if (rt6_check_expired(rt))
678 m = rt6_score_route(rt, oif, strict);
679 if (m == RT6_NUD_FAIL_DO_RR) {
681 m = 0; /* lowest valid score */
682 } else if (m == RT6_NUD_FAIL_HARD) {
686 if (strict & RT6_LOOKUP_F_REACHABLE)
689 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
691 *do_rr = match_do_rr;
/* Scan the leaf chain in round-robin order — first from @rr_head to the
 * end, then from the leaf head back up to @rr_head — restricted to
 * entries with the given @metric, feeding each candidate to find_match().
 * Continuation entries (different metric) are handled in a final pass.
 */
699 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
700 struct rt6_info *rr_head,
701 u32 metric, int oif, int strict,
704 struct rt6_info *rt, *match, *cont;
709 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
710 if (rt->rt6i_metric != metric) {
715 match = find_match(rt, oif, strict, &mpri, match, do_rr);
718 for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
719 if (rt->rt6i_metric != metric) {
724 match = find_match(rt, oif, strict, &mpri, match, do_rr);
730 for (rt = cont; rt; rt = rt->dst.rt6_next)
731 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Select the best route in fib6 node @fn, starting the round-robin scan
 * at fn->rr_ptr (initialised to the leaf) and advancing rr_ptr when
 * find_rr_leaf() requested rotation.  Falls back to the per-netns null
 * entry when nothing matched.
 */
736 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
738 struct rt6_info *match, *rt0;
744 fn->rr_ptr = rt0 = fn->leaf;
746 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
750 struct rt6_info *next = rt0->dst.rt6_next;
752 /* no entries matched; do round-robin */
753 if (!next || next->rt6i_metric != rt0->rt6i_metric)
760 net = dev_net(rt0->dst.dev);
761 return match ? match : net->ipv6.ip6_null_entry;
/* True when the route has a gateway or is marked no-next-hop. */
764 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
766 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
769 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option received in a Router Advertisement
 * (RFC 4191): validate length/prefix_len, then add, refresh or (on zero
 * lifetime) remove the corresponding RTF_ROUTEINFO route for @gwaddr.
 */
770 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
771 const struct in6_addr *gwaddr)
773 struct net *net = dev_net(dev);
774 struct route_info *rinfo = (struct route_info *) opt;
775 struct in6_addr prefix_buf, *prefix;
777 unsigned long lifetime;
780 if (len < sizeof(struct route_info)) {
784 /* Sanity check for prefix_len and length */
785 if (rinfo->length > 3) {
787 } else if (rinfo->prefix_len > 128) {
789 } else if (rinfo->prefix_len > 64) {
790 if (rinfo->length < 2) {
793 } else if (rinfo->prefix_len > 0) {
794 if (rinfo->length < 1) {
799 pref = rinfo->route_pref;
800 if (pref == ICMPV6_ROUTER_PREF_INVALID)
803 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means a full 16-byte prefix is present; otherwise copy
 * only prefix_len bits into a local buffer.
 */
805 if (rinfo->length == 3)
806 prefix = (struct in6_addr *)rinfo->prefix;
808 /* this function is safe */
809 ipv6_addr_prefix(&prefix_buf,
810 (struct in6_addr *)rinfo->prefix,
812 prefix = &prefix_buf;
/* prefix_len 0 is the default-router case */
815 if (rinfo->prefix_len == 0)
816 rt = rt6_get_dflt_router(gwaddr, dev);
818 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
819 gwaddr, dev->ifindex);
821 if (rt && !lifetime) {
827 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
830 rt->rt6i_flags = RTF_ROUTEINFO |
831 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
834 if (!addrconf_finite_timeout(lifetime))
835 rt6_clean_expires(rt);
837 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* Walk back up the fib6 tree from @fn after a failed match, descending
 * into source-address subtrees (CONFIG_IPV6_SUBTREES) on the way, until
 * a node carrying route info is found or the tree root is reached.
 */
845 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
846 struct in6_addr *saddr)
848 struct fib6_node *pn;
850 if (fn->fn_flags & RTN_TL_ROOT)
853 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
854 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
857 if (fn->fn_flags & RTN_RTINFO)
/* Lockless-caller route lookup used by ip6_route_lookup()/rt6_lookup():
 * find the fib6 node, filter by device/source, apply ECMP selection,
 * backtracking on a null-entry result.  Takes tb6_lock for reading and
 * bumps the dst use count before returning.
 */
862 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
863 struct fib6_table *table,
864 struct flowi6 *fl6, int flags)
866 struct fib6_node *fn;
869 read_lock_bh(&table->tb6_lock);
870 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
873 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
874 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
875 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
876 if (rt == net->ipv6.ip6_null_entry) {
877 fn = fib6_backtrack(fn, &fl6->saddr);
881 dst_use(&rt->dst, jiffies);
882 read_unlock_bh(&table->tb6_lock);
/* Public entry point: policy-rule aware lookup delegating to
 * ip6_pol_route_lookup().
 */
887 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
890 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
892 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by daddr/saddr/oif: builds a flowi6 (adding
 * RT6_LOOKUP_F_HAS_SADDR when a source is given, RT6_LOOKUP_F_IFACE when
 * @strict) and returns the resulting rt6_info.
 */
894 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
895 const struct in6_addr *saddr, int oif, int strict)
897 struct flowi6 fl6 = {
901 struct dst_entry *dst;
902 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
905 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
906 flags |= RT6_LOOKUP_F_HAS_SADDR;
909 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
911 return (struct rt6_info *) dst;
917 EXPORT_SYMBOL(rt6_lookup);
919 /* ip6_ins_rt is called with FREE table->tb6_lock.
920 It takes new route entry, the addition fails by any reason the
921 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its fib6 table under the table write lock. */
925 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
926 struct mx6_config *mxc)
929 struct fib6_table *table;
931 table = rt->rt6i_table;
932 write_lock_bh(&table->tb6_lock);
933 err = fib6_add(&table->tb6_root, rt, info, mxc);
934 write_unlock_bh(&table->tb6_lock);
/* Insert @rt with default netlink info (route's netns) and empty metrics. */
939 int ip6_ins_rt(struct rt6_info *rt)
941 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
942 struct mx6_config mxc = { .mx = NULL, };
944 return __ip6_ins_rt(rt, &info, &mxc);
/* Create an RTF_CACHE host-route (/128) clone of @ort for @daddr (and
 * @saddr under CONFIG_IPV6_SUBTREES).  If @ort is itself a cache/percpu
 * clone, clone its parent (dst.from) instead.  Non-gateway clones of a
 * non-/128 route matching daddr exactly are flagged RTF_ANYCAST.
 */
947 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
948 const struct in6_addr *daddr,
949 const struct in6_addr *saddr)
957 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
958 ort = (struct rt6_info *)ort->dst.from;
960 rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
965 ip6_rt_copy_init(rt, ort);
966 rt->rt6i_flags |= RTF_CACHE;
968 rt->dst.flags |= DST_HOST;
969 rt->rt6i_dst.addr = *daddr;
970 rt->rt6i_dst.plen = 128;
972 if (!rt6_is_gw_or_nonexthop(ort)) {
973 if (ort->rt6i_dst.plen != 128 &&
974 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
975 rt->rt6i_flags |= RTF_ANYCAST;
976 #ifdef CONFIG_IPV6_SUBTREES
977 if (rt->rt6i_src.plen && saddr) {
978 rt->rt6i_src.addr = *saddr;
979 rt->rt6i_src.plen = 128;
/* Allocate a per-cpu (RTF_PCPU) clone of @rt, copying its init state
 * and protocol.
 */
987 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
989 struct rt6_info *pcpu_rt;
991 pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
992 rt->dst.dev, rt->dst.flags);
996 ip6_rt_copy_init(pcpu_rt, rt);
997 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
998 pcpu_rt->rt6i_flags |= RTF_PCPU;
1002 /* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Return this CPU's cached clone of @rt (with a fresh dst hold and
 * metrics re-sync), or NULL if none has been created yet.
 */
1003 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1005 struct rt6_info *pcpu_rt, **p;
1007 p = this_cpu_ptr(rt->rt6i_pcpu);
1011 dst_hold(&pcpu_rt->dst);
1012 rt6_dst_from_metrics_check(pcpu_rt);
/* Create and install this CPU's clone of @rt.  Allocation failure falls
 * back to the held null entry.  The install uses cmpxchg under the table
 * read lock so a concurrent creator's clone wins and ours is destroyed;
 * if @rt was removed from the tree meanwhile (rt6i_pcpu gone), the clone
 * is dropped and the next dst_check() forces a re-lookup.
 */
1017 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1019 struct fib6_table *table = rt->rt6i_table;
1020 struct rt6_info *pcpu_rt, *prev, **p;
1022 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1024 struct net *net = dev_net(rt->dst.dev);
1026 dst_hold(&net->ipv6.ip6_null_entry->dst);
1027 return net->ipv6.ip6_null_entry;
1030 read_lock_bh(&table->tb6_lock);
1031 if (rt->rt6i_pcpu) {
1032 p = this_cpu_ptr(rt->rt6i_pcpu);
1033 prev = cmpxchg(p, NULL, pcpu_rt);
1035 /* If someone did it before us, return prev instead */
1036 dst_destroy(&pcpu_rt->dst);
1040 /* rt has been removed from the fib6 tree
1041 * before we have a chance to acquire the read_lock.
* In this case, don't bother to create a pcpu rt
1043 * since rt is going away anyway. The next
1044 * dst_check() will trigger a re-lookup.
1046 dst_destroy(&pcpu_rt->dst);
1049 dst_hold(&pcpu_rt->dst);
1050 rt6_dst_from_metrics_check(pcpu_rt);
1051 read_unlock_bh(&table->tb6_lock);
/* Core routing slow path shared by input and output lookups.
 * Select the best route (reachability-strict first when forwarding is
 * off, relaxed on failure), apply ECMP, then return one of:
 *  - the route itself for null/RTF_CACHE entries,
 *  - an uncached RTF_CACHE clone for FLOWI_FLAG_KNOWN_NH on a
 *    non-gateway route (skb daddr may differ from fl6->daddr),
 *  - otherwise this CPU's percpu clone, creating it outside the read
 *    lock because rt6_make_pcpu_route() may trigger ip6_dst_gc() which
 *    takes the write lock.
 */
1055 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1056 struct flowi6 *fl6, int flags)
1058 struct fib6_node *fn, *saved_fn;
1059 struct rt6_info *rt;
1062 strict |= flags & RT6_LOOKUP_F_IFACE;
1063 if (net->ipv6.devconf_all->forwarding == 0)
1064 strict |= RT6_LOOKUP_F_REACHABLE;
1066 read_lock_bh(&table->tb6_lock);
1068 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1072 rt = rt6_select(fn, oif, strict);
1073 if (rt->rt6i_nsiblings)
1074 rt = rt6_multipath_select(rt, fl6, oif, strict);
1075 if (rt == net->ipv6.ip6_null_entry) {
1076 fn = fib6_backtrack(fn, &fl6->saddr);
1078 goto redo_rt6_select;
1079 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1080 /* also consider unreachable route */
1081 strict &= ~RT6_LOOKUP_F_REACHABLE;
1083 goto redo_rt6_select;
1088 if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1089 dst_use(&rt->dst, jiffies);
1090 read_unlock_bh(&table->tb6_lock);
1092 rt6_dst_from_metrics_check(rt);
1094 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1095 !(rt->rt6i_flags & RTF_GATEWAY))) {
1096 /* Create a RTF_CACHE clone which will not be
1097 * owned by the fib6 tree. It is for the special case where
1098 * the daddr in the skb during the neighbor look-up is different
1099 * from the fl6->daddr used to look-up route here.
1102 struct rt6_info *uncached_rt;
1104 dst_use(&rt->dst, jiffies);
1105 read_unlock_bh(&table->tb6_lock);
1107 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1108 dst_release(&rt->dst);
1111 rt6_uncached_list_add(uncached_rt);
1113 uncached_rt = net->ipv6.ip6_null_entry;
1115 dst_hold(&uncached_rt->dst);
1119 /* Get a percpu copy */
1121 struct rt6_info *pcpu_rt;
1123 rt->dst.lastuse = jiffies;
1125 pcpu_rt = rt6_get_pcpu_route(rt);
1128 read_unlock_bh(&table->tb6_lock);
1130 /* We have to do the read_unlock first
1131 * because rt6_make_pcpu_route() may trigger
1132 * ip6_dst_gc() which will take the write_lock.
1135 read_unlock_bh(&table->tb6_lock);
1136 pcpu_rt = rt6_make_pcpu_route(rt);
1137 dst_release(&rt->dst);
/* Input-path policy lookup: route on the incoming interface (iif). */
1145 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1146 struct flowi6 *fl6, int flags)
1148 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Input lookup wrapper: force strict interface matching for link-local /
 * multicast destinations (except on PIM register devices), then go
 * through the policy rules.
 */
1151 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1152 struct net_device *dev,
1153 struct flowi6 *fl6, int flags)
1155 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1156 flags |= RT6_LOOKUP_F_IFACE;
1158 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/* Receive-path entry point: build a flowi6 from the IPv6 header (plus a
 * tunnel key for RX metadata dsts) and attach the looked-up dst to @skb.
 */
1161 void ip6_route_input(struct sk_buff *skb)
1163 const struct ipv6hdr *iph = ipv6_hdr(skb);
1164 struct net *net = dev_net(skb->dev);
1165 int flags = RT6_LOOKUP_F_HAS_SADDR;
1166 struct ip_tunnel_info *tun_info;
1167 struct flowi6 fl6 = {
1168 .flowi6_iif = skb->dev->ifindex,
1169 .daddr = iph->daddr,
1170 .saddr = iph->saddr,
1171 .flowlabel = ip6_flowinfo(iph),
1172 .flowi6_mark = skb->mark,
1173 .flowi6_proto = iph->nexthdr,
1176 tun_info = skb_tunnel_info(skb);
1177 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1178 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1180 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path policy lookup: route on the outgoing interface (oif). */
1183 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1184 struct flowi6 *fl6, int flags)
1186 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Public output-route lookup: sets iif to loopback, derives strictness
 * from socket device binding / destination scope, adds the has-saddr
 * flag and socket source-address preferences, then runs the policy
 * rules.
 */
1189 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1194 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1196 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1198 flags |= RT6_LOOKUP_F_IFACE;
1200 if (!ipv6_addr_any(&fl6->saddr))
1201 flags |= RT6_LOOKUP_F_HAS_SADDR;
1203 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1205 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1207 EXPORT_SYMBOL(ip6_route_output);
/* Convert @dst_orig into a blackhole dst (used e.g. by xfrm): allocate a
 * new rt6_info with ip6_dst_blackhole_ops, discard all traffic, and copy
 * metrics, idev, gateway, flags (minus RTF_PCPU) and keys from the
 * original, which is released.  Returns -ENOMEM on allocation failure.
 */
1209 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1211 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1212 struct dst_entry *new = NULL;
1214 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1220 new->input = dst_discard;
1221 new->output = dst_discard_sk;
1223 dst_copy_metrics(new, &ort->dst);
1224 rt->rt6i_idev = ort->rt6i_idev;
1226 in6_dev_hold(rt->rt6i_idev);
1228 rt->rt6i_gateway = ort->rt6i_gateway;
1229 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1230 rt->rt6i_metric = 0;
1232 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1233 #ifdef CONFIG_IPV6_SUBTREES
1234 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1240 dst_release(dst_orig);
1241 return new ? new : ERR_PTR(-ENOMEM);
1245 * Destination cache support functions
/* Re-sync a clone's metrics pointer with its parent's (dst.from) if the
 * parent's metrics were replaced since the clone was made.
 */
1248 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1251 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1252 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validate a tree-owned route against @cookie (the fib6 node's serial
 * number) and expiry; an invalid route yields NULL so the caller
 * re-looks-up.
 */
1255 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1257 if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1260 if (rt6_check_expired(rt))
/* Validate a clone by checking the route it was cloned from (dst.from)
 * against @cookie.
 */
1266 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1268 if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1269 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check: always invoked because IPv6 dsts are created with
 * DST_OBSOLETE_FORCE_CHK.  Percpu/uncached clones are validated through
 * their parent; tree routes directly.
 */
1275 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1277 struct rt6_info *rt;
1279 rt = (struct rt6_info *) dst;
1281 /* All IPV6 dsts are created with ->obsolete set to the value
1282 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1283 * into this function always.
1286 rt6_dst_from_metrics_check(rt);
1288 if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1289 return rt6_dst_from_check(rt, cookie);
1291 return rt6_check(rt, cookie);
/* dst_ops->negative_advice: drop an expired RTF_CACHE entry so the
 * caller falls back to a fresh lookup.  (The removal/release statements
 * are elided from this view.)
 */
1294 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1296 struct rt6_info *rt = (struct rt6_info *) dst;
1299 if (rt->rt6i_flags & RTF_CACHE) {
1300 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure: report address-unreachable to the sender, then
 * invalidate the route — cache entries are removed, default routes have
 * their node's serial bumped to -1 so dst_check() fails.
 */
1312 static void ip6_link_failure(struct sk_buff *skb)
1314 struct rt6_info *rt;
1316 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1318 rt = (struct rt6_info *) skb_dst(skb);
1320 if (rt->rt6i_flags & RTF_CACHE) {
1323 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1324 rt->rt6i_node->fn_sernum = -1;
/* Record a learned path MTU on a cache route and arm its expiry timer
 * (ip6_rt_mtu_expires sysctl).
 */
1329 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1331 struct net *net = dev_net(rt->dst.dev);
1333 rt->rt6i_flags |= RTF_MODIFIED;
1334 rt->rt6i_pmtu = mtu;
1335 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* Apply a PMTU update to @dst: ignore local routes and MTUs that are not
 * smaller (after clamping to IPV6_MIN_MTU).  Cache routes are updated in
 * place; otherwise a new RTF_CACHE clone is created for the flow's
 * addresses (from the header or the socket) and inserted, which bumps
 * the node serial and invalidates stale socket dst caches.
 */
1338 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1339 const struct ipv6hdr *iph, u32 mtu)
1341 struct rt6_info *rt6 = (struct rt6_info *)dst;
1343 if (rt6->rt6i_flags & RTF_LOCAL)
1347 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1348 if (mtu >= dst_mtu(dst))
1351 if (rt6->rt6i_flags & RTF_CACHE) {
1352 rt6_do_update_pmtu(rt6, mtu);
1354 const struct in6_addr *daddr, *saddr;
1355 struct rt6_info *nrt6;
1358 daddr = &iph->daddr;
1359 saddr = &iph->saddr;
1361 daddr = &sk->sk_v6_daddr;
1362 saddr = &inet6_sk(sk)->saddr;
1366 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1368 rt6_do_update_pmtu(nrt6, mtu);
1370 /* ip6_ins_rt(nrt6) will bump the
1371 * rt6->rt6i_node->fn_sernum
1372 * which will fail the next rt6_check() and
1373 * invalidate the sk->sk_dst_cache.
/* dst_ops->update_pmtu: delegate with the skb's IPv6 header if present. */
1380 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1381 struct sk_buff *skb, u32 mtu)
1383 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Handle an ICMPv6 Packet Too Big for the embedded packet in @skb: build
 * the flow from the quoted header, look up the route and update its PMTU.
 */
1386 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1389 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1390 struct dst_entry *dst;
1393 memset(&fl6, 0, sizeof(fl6));
1394 fl6.flowi6_oif = oif;
1395 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1396 fl6.daddr = iph->daddr;
1397 fl6.saddr = iph->saddr;
1398 fl6.flowlabel = ip6_flowinfo(iph);
1400 dst = ip6_route_output(net, NULL, &fl6);
1402 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1405 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket convenience wrapper: PMTU update scoped by the socket's bound
 * device and mark.
 */
1407 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1409 ip6_update_pmtu(skb, sock_net(sk), mtu,
1410 sk->sk_bound_dev_if, sk->sk_mark);
1412 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1414 /* Handle redirects */
/* flowi6 extended with the redirecting router's address, cast back out
 * in __ip6_route_redirect().
 */
1415 struct ip6rd_flowi {
1417 struct in6_addr gateway;
/* Find the route an ICMPv6 redirect applies to: per RFC 4861 the
 * redirect must come from the current next hop, so scan the matching
 * fib6 node for a non-expired gateway route on the right interface whose
 * gateway equals the redirecting router, backtracking when only the null
 * entry matches.
 */
1420 static struct rt6_info *__ip6_route_redirect(struct net *net,
1421 struct fib6_table *table,
1425 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1426 struct rt6_info *rt;
1427 struct fib6_node *fn;
1429 /* Get the "current" route for this destination and
* check if the redirect has come from the appropriate router.
1432 * RFC 4861 specifies that redirects should only be
1433 * accepted if they come from the nexthop to the target.
1434 * Due to the way the routes are chosen, this notion
1435 * is a bit fuzzy and one might need to check all possible
1439 read_lock_bh(&table->tb6_lock);
1440 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1442 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1443 if (rt6_check_expired(rt))
1447 if (!(rt->rt6i_flags & RTF_GATEWAY))
1449 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1451 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1457 rt = net->ipv6.ip6_null_entry;
1458 else if (rt->dst.error) {
1459 rt = net->ipv6.ip6_null_entry;
1463 if (rt == net->ipv6.ip6_null_entry) {
1464 fn = fib6_backtrack(fn, &fl6->saddr);
1472 read_unlock_bh(&table->tb6_lock);
/* Wrap @fl6 plus the redirecting @gateway into an ip6rd_flowi and run
 * the policy rules through __ip6_route_redirect().
 */
1477 static struct dst_entry *ip6_route_redirect(struct net *net,
1478 const struct flowi6 *fl6,
1479 const struct in6_addr *gateway)
1481 int flags = RT6_LOOKUP_F_HAS_SADDR;
1482 struct ip6rd_flowi rdfl;
1485 rdfl.gateway = *gateway;
1487 return fib6_rule_lookup(net, &rdfl.fl6,
1488 flags, __ip6_route_redirect);
/* Process a redirect for the flow described by the IPv6 header at skb->data:
 * look up the affected route and apply the redirect to it.
 */
1491 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1493 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1494 struct dst_entry *dst;
1497 memset(&fl6, 0, sizeof(fl6));
1498 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1499 fl6.flowi6_oif = oif;
1500 fl6.flowi6_mark = mark;
1501 fl6.daddr = iph->daddr;
1502 fl6.saddr = iph->saddr;
1503 fl6.flowlabel = ip6_flowinfo(iph);
/* the redirect is only valid if it came from the current nexthop
 * (outer IPv6 source address)
 */
1505 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1506 rt6_do_redirect(dst, NULL, skb);
1509 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Variant of ip6_redirect() used when the redirect's ICMPv6 payload does not
 * carry the original packet header: the flow is reconstructed from the
 * redirect message itself (msg->dest) and the outer IPv6 header.
 */
1511 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1514 const struct ipv6hdr *iph = ipv6_hdr(skb);
1515 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1516 struct dst_entry *dst;
1519 memset(&fl6, 0, sizeof(fl6));
1520 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1521 fl6.flowi6_oif = oif;
1522 fl6.flowi6_mark = mark;
1523 fl6.daddr = msg->dest;
/* we were the destination of the redirected packet */
1524 fl6.saddr = iph->daddr;
1526 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1527 rt6_do_redirect(dst, NULL, skb);
/* Socket-level wrapper: handle a redirect scoped by the socket's bound
 * device and fwmark.
 */
1531 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1533 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1535 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss: derive the advertised TCP MSS from the route MTU,
 * clamped below by the ip6_rt_min_advmss sysctl and above by the largest
 * non-jumbogram payload.
 */
1537 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1539 struct net_device *dev = dst->dev;
1540 unsigned int mtu = dst_mtu(dst);
1541 struct net *net = dev_net(dev);
/* MSS = MTU minus fixed IPv6 + TCP header overhead */
1543 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1545 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1546 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1549 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1550 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1551 * IPV6_MAXPLEN is also valid and means: "any MSS,
1552 * rely only on pmtu discovery"
1554 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1559 static unsigned int ip6_mtu(const struct dst_entry *dst)
1561 const struct rt6_info *rt = (const struct rt6_info *)dst;
1562 unsigned int mtu = rt->rt6i_pmtu;
1563 struct inet6_dev *idev;
1568 mtu = dst_metric_raw(dst, RTAX_MTU);
1575 idev = __in6_dev_get(dst->dev);
1577 mtu = idev->cnf.mtu6;
1581 return min_t(unsigned int, mtu, IP6_MAX_MTU);
/* Singly linked list of ICMPv6 dst entries created by icmp6_dst_alloc(),
 * reclaimed by icmp6_dst_gc(); both list and traversals are protected by
 * icmp6_dst_lock.
 */
1584 static struct dst_entry *icmp6_dst_gc_list;
1585 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Allocate a standalone host route for sending an ICMPv6 packet, outside the
 * FIB.  The entry is chained onto icmp6_dst_gc_list for later garbage
 * collection and passed through xfrm_lookup() before being returned.
 * Returns an ERR_PTR on failure (-ENODEV, -ENOMEM).
 */
1587 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1590 struct dst_entry *dst;
1591 struct rt6_info *rt;
1592 struct inet6_dev *idev = in6_dev_get(dev);
1593 struct net *net = dev_net(dev);
1595 if (unlikely(!idev))
1596 return ERR_PTR(-ENODEV);
1598 rt = ip6_dst_alloc(net, dev, 0);
1599 if (unlikely(!rt)) {
1601 dst = ERR_PTR(-ENOMEM);
1605 rt->dst.flags |= DST_HOST;
1606 rt->dst.output = ip6_output;
1607 atomic_set(&rt->dst.__refcnt, 1);
/* host route to the flow's destination; no gateway */
1608 rt->rt6i_gateway = fl6->daddr;
1609 rt->rt6i_dst.addr = fl6->daddr;
1610 rt->rt6i_dst.plen = 128;
1611 rt->rt6i_idev = idev;
1612 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
/* register on the global GC list under the list lock */
1614 spin_lock_bh(&icmp6_dst_lock);
1615 rt->dst.next = icmp6_dst_gc_list;
1616 icmp6_dst_gc_list = &rt->dst;
1617 spin_unlock_bh(&icmp6_dst_lock);
1619 fib6_force_start_gc(net);
1621 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* Walk icmp6_dst_gc_list and release entries whose refcount has dropped to
 * zero.  Runs under icmp6_dst_lock.
 */
1627 int icmp6_dst_gc(void)
1629 struct dst_entry *dst, **pprev;
1632 spin_lock_bh(&icmp6_dst_lock);
1633 pprev = &icmp6_dst_gc_list;
1635 while ((dst = *pprev) != NULL) {
1636 if (!atomic_read(&dst->__refcnt)) {
1645 spin_unlock_bh(&icmp6_dst_lock);
/* Apply @func to every entry on icmp6_dst_gc_list (e.g. fib6_ifdown when a
 * device goes away); entries for which @func returns nonzero are removed.
 * Runs under icmp6_dst_lock.
 */
1650 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1653 struct dst_entry *dst, **pprev;
1655 spin_lock_bh(&icmp6_dst_lock);
1656 pprev = &icmp6_dst_gc_list;
1657 while ((dst = *pprev) != NULL) {
1658 struct rt6_info *rt = (struct rt6_info *) dst;
1659 if (func(rt, arg)) {
1666 spin_unlock_bh(&icmp6_dst_lock);
/* dst_ops->gc for IPv6: run fib6 garbage collection when the entry count
 * exceeds ip6_rt_max_size or the minimum GC interval has elapsed.  The
 * adaptive ip6_rt_gc_expire value shortens route lifetimes under pressure
 * and decays by the elasticity sysctl otherwise.  Returns nonzero when the
 * table is still over the size limit (allocation should fail).
 */
1669 static int ip6_dst_gc(struct dst_ops *ops)
1671 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1672 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1673 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1674 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1675 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1676 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1679 entries = dst_entries_get_fast(ops);
/* too soon since last GC and not over the limit: do nothing */
1680 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1681 entries <= rt_max_size)
/* increase GC aggressiveness, then collect */
1684 net->ipv6.ip6_rt_gc_expire++;
1685 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1686 entries = dst_entries_get_slow(ops);
1687 if (entries < ops->gc_thresh)
1688 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
/* exponential decay of the aggressiveness between GC rounds */
1690 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1691 return entries > rt_max_size;
/* Convert the netlink RTA_METRICS attribute list in @cfg into the fixed
 * RTAX_MAX-slot metrics array of @mxc.  RTAX_CC_ALGO names are translated to
 * congestion-control keys; RTAX_FEATURES bits are validated against
 * RTAX_FEATURE_MASK.  The array is kmalloc'ed; ownership passes to @mxc.
 */
1694 static int ip6_convert_metrics(struct mx6_config *mxc,
1695 const struct fib6_config *cfg)
1697 bool ecn_ca = false;
1705 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1709 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1710 int type = nla_type(nla);
1715 if (unlikely(type > RTAX_MAX))
1718 if (type == RTAX_CC_ALGO) {
1719 char tmp[TCP_CA_NAME_MAX];
1721 nla_strlcpy(tmp, nla, sizeof(tmp));
1722 val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1723 if (val == TCP_CA_UNSPEC)
1726 val = nla_get_u32(nla);
/* reject unknown feature bits */
1728 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1732 __set_bit(type - 1, mxc->mx_valid);
/* CC algorithm requested ECN: record it as a feature bit */
1736 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1737 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Build (but do not insert) a struct rt6_info from a fib6_config that was
 * decoded from netlink or ioctl.  On success *rt_ret holds the new route.
 * Validates prefix lengths and gateway reachability, selects/creates the
 * FIB table, wires up input/output handlers (including lwtunnel redirection)
 * and promotes loopback routes to reject routes.
 */
1747 int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
1750 struct net *net = cfg->fc_nlinfo.nl_net;
1751 struct rt6_info *rt = NULL;
1752 struct net_device *dev = NULL;
1753 struct inet6_dev *idev = NULL;
1754 struct fib6_table *table;
/* prefix lengths are at most 128 bits */
1757 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1759 #ifndef CONFIG_IPV6_SUBTREES
/* source routing requires subtree support */
1760 if (cfg->fc_src_len)
1763 if (cfg->fc_ifindex) {
1765 dev = dev_get_by_index(net, cfg->fc_ifindex);
1768 idev = in6_dev_get(dev);
1773 if (cfg->fc_metric == 0)
1774 cfg->fc_metric = IP6_RT_PRIO_USER;
/* without NLM_F_CREATE only an existing table may be used */
1777 if (cfg->fc_nlinfo.nlh &&
1778 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1779 table = fib6_get_table(net, cfg->fc_table);
1781 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1782 table = fib6_new_table(net, cfg->fc_table);
1785 table = fib6_new_table(net, cfg->fc_table);
1791 rt = ip6_dst_alloc(net, NULL,
1792 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1799 if (cfg->fc_flags & RTF_EXPIRES)
1800 rt6_set_expires(rt, jiffies +
1801 clock_t_to_jiffies(cfg->fc_expires));
1803 rt6_clean_expires(rt);
1805 if (cfg->fc_protocol == RTPROT_UNSPEC)
1806 cfg->fc_protocol = RTPROT_BOOT;
1807 rt->rt6i_protocol = cfg->fc_protocol;
1809 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* choose the input handler from the destination type */
1811 if (addr_type & IPV6_ADDR_MULTICAST)
1812 rt->dst.input = ip6_mc_input;
1813 else if (cfg->fc_flags & RTF_LOCAL)
1814 rt->dst.input = ip6_input;
1816 rt->dst.input = ip6_forward;
1818 rt->dst.output = ip6_output;
/* optional lightweight tunnel encapsulation */
1820 if (cfg->fc_encap) {
1821 struct lwtunnel_state *lwtstate;
1823 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1824 cfg->fc_encap, AF_INET6, cfg,
1828 rt->dst.lwtstate = lwtstate_get(lwtstate);
1829 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1830 rt->dst.lwtstate->orig_output = rt->dst.output;
1831 rt->dst.output = lwtunnel_output;
1833 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1834 rt->dst.lwtstate->orig_input = rt->dst.input;
1835 rt->dst.input = lwtunnel_input;
1839 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1840 rt->rt6i_dst.plen = cfg->fc_dst_len;
1841 if (rt->rt6i_dst.plen == 128)
1842 rt->dst.flags |= DST_HOST;
1844 #ifdef CONFIG_IPV6_SUBTREES
1845 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1846 rt->rt6i_src.plen = cfg->fc_src_len;
1849 rt->rt6i_metric = cfg->fc_metric;
1851 /* We cannot add true routes via loopback here,
1852 they would result in kernel looping; promote them to reject routes
1854 if ((cfg->fc_flags & RTF_REJECT) ||
1855 (dev && (dev->flags & IFF_LOOPBACK) &&
1856 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1857 !(cfg->fc_flags & RTF_LOCAL))) {
1858 /* hold loopback dev/idev if we haven't done so. */
1859 if (dev != net->loopback_dev) {
1864 dev = net->loopback_dev;
1866 idev = in6_dev_get(dev);
1872 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* map the netlink route type onto a dst error / handler pair */
1873 switch (cfg->fc_type) {
1875 rt->dst.error = -EINVAL;
1876 rt->dst.output = dst_discard_sk;
1877 rt->dst.input = dst_discard;
1880 rt->dst.error = -EACCES;
1881 rt->dst.output = ip6_pkt_prohibit_out;
1882 rt->dst.input = ip6_pkt_prohibit;
1885 case RTN_UNREACHABLE:
1887 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1888 : (cfg->fc_type == RTN_UNREACHABLE)
1889 ? -EHOSTUNREACH : -ENETUNREACH;
1890 rt->dst.output = ip6_pkt_discard_out;
1891 rt->dst.input = ip6_pkt_discard;
1897 if (cfg->fc_flags & RTF_GATEWAY) {
1898 const struct in6_addr *gw_addr;
1901 gw_addr = &cfg->fc_gateway;
1902 gwa_type = ipv6_addr_type(gw_addr);
1904 /* if gw_addr is local we will fail to detect this in case
1905 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1906 * will return already-added prefix route via interface that
1907 * prefix route was assigned to, which might be non-loopback.
1910 if (ipv6_chk_addr_and_flags(net, gw_addr,
1911 gwa_type & IPV6_ADDR_LINKLOCAL ?
1915 rt->rt6i_gateway = *gw_addr;
1917 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1918 struct rt6_info *grt;
1920 /* IPv6 strictly inhibits using not link-local
1921 addresses as nexthop address.
1922 Otherwise, router will not able to send redirects.
1923 It is very good, but in some (rare!) circumstances
1924 (SIT, PtP, NBMA NOARP links) it is handy to allow
1925 some exceptions. --ANK
1927 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* the gateway itself must be reachable via some route */
1930 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1932 err = -EHOSTUNREACH;
1936 if (dev != grt->dst.dev) {
1942 idev = grt->rt6i_idev;
1944 in6_dev_hold(grt->rt6i_idev);
1946 if (!(grt->rt6i_flags & RTF_GATEWAY))
1954 if (!dev || (dev->flags & IFF_LOOPBACK))
/* optional preferred source address must be local on @dev */
1962 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1963 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1967 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1968 rt->rt6i_prefsrc.plen = 128;
1970 rt->rt6i_prefsrc.plen = 0;
1972 rt->rt6i_flags = cfg->fc_flags;
1976 rt->rt6i_idev = idev;
1977 rt->rt6i_table = table;
1979 cfg->fc_nlinfo.nl_net = dev_net(dev);
/* Create a route from @cfg (ip6_route_info_create), convert its metrics and
 * insert it into the FIB.
 */
1997 int ip6_route_add(struct fib6_config *cfg)
1999 struct mx6_config mxc = { .mx = NULL, };
2000 struct rt6_info *rt = NULL;
2003 err = ip6_route_info_create(cfg, &rt);
2007 err = ip6_convert_metrics(&mxc, cfg);
2011 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
/* Remove @rt from its FIB table under the table write lock.  The null entry
 * and uncached (DST_NOCACHE) routes are not in the tree and are rejected.
 */
2023 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2026 struct fib6_table *table;
2027 struct net *net = dev_net(rt->dst.dev);
2029 if (rt == net->ipv6.ip6_null_entry ||
2030 rt->dst.flags & DST_NOCACHE) {
2035 table = rt->rt6i_table;
2036 write_lock_bh(&table->tb6_lock);
2037 err = fib6_del(rt, info);
2038 write_unlock_bh(&table->tb6_lock);
/* Public route deletion entry point: delete @rt with default netlink info. */
2045 int ip6_del_rt(struct rt6_info *rt)
2047 struct nl_info info = {
2048 .nl_net = dev_net(rt->dst.dev),
2050 return __ip6_del_rt(rt, &info);
/* Delete the route matching @cfg: locate the FIB node for the dst/src
 * prefixes, then scan its leaf chain for an entry matching the optional
 * ifindex, gateway and metric constraints.
 */
2053 static int ip6_route_del(struct fib6_config *cfg)
2055 struct fib6_table *table;
2056 struct fib6_node *fn;
2057 struct rt6_info *rt;
2060 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2064 read_lock_bh(&table->tb6_lock);
2066 fn = fib6_locate(&table->tb6_root,
2067 &cfg->fc_dst, cfg->fc_dst_len,
2068 &cfg->fc_src, cfg->fc_src_len);
2071 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
/* cached clones are only deleted when RTF_CACHE was requested */
2072 if ((rt->rt6i_flags & RTF_CACHE) &&
2073 !(cfg->fc_flags & RTF_CACHE))
2075 if (cfg->fc_ifindex &&
2077 rt->dst.dev->ifindex != cfg->fc_ifindex))
2079 if (cfg->fc_flags & RTF_GATEWAY &&
2080 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2082 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* drop the read lock before taking the write lock in __ip6_del_rt */
2085 read_unlock_bh(&table->tb6_lock);
2087 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2090 read_unlock_bh(&table->tb6_lock);
/* dst_ops->redirect: validate and apply an ICMPv6 redirect message.
 * Sanity-checks the rd_msg (length, multicast dest, link-local target),
 * honors forwarding/accept_redirects settings, updates the neighbour cache
 * from the target lladdr option, installs a cached route via the new
 * gateway and notifies netevent listeners.
 */
2095 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2097 struct net *net = dev_net(skb->dev);
2098 struct netevent_redirect netevent;
2099 struct rt6_info *rt, *nrt = NULL;
2100 struct ndisc_options ndopts;
2101 struct inet6_dev *in6_dev;
2102 struct neighbour *neigh;
2104 int optlen, on_link;
2107 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2108 optlen -= sizeof(*msg);
2111 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2115 msg = (struct rd_msg *)icmp6_hdr(skb);
2117 if (ipv6_addr_is_multicast(&msg->dest)) {
2118 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination is on-link */
2123 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2125 } else if (ipv6_addr_type(&msg->target) !=
2126 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2127 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2131 in6_dev = __in6_dev_get(skb->dev);
/* routers and hosts with accept_redirects disabled ignore redirects */
2134 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2138 * The IP source address of the Redirect MUST be the same as the current
2139 * first-hop router for the specified ICMP Destination Address.
2142 if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2143 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2148 if (ndopts.nd_opts_tgt_lladdr) {
2149 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2152 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2157 rt = (struct rt6_info *) dst;
2158 if (rt == net->ipv6.ip6_null_entry) {
2159 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2163 /* Redirect received -> path was valid.
2164 * Look, redirects are sent only in response to data packets,
2165 * so that this nexthop apparently is reachable. --ANK
2167 dst_confirm(&rt->dst);
2169 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2174 * We have finally decided to accept it.
2177 neigh_update(neigh, lladdr, NUD_STALE,
2178 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2179 NEIGH_UPDATE_F_OVERRIDE|
2180 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2181 NEIGH_UPDATE_F_ISROUTER))
/* clone a cached host route for the redirected destination */
2184 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2188 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2190 nrt->rt6i_flags &= ~RTF_GATEWAY;
2192 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2194 if (ip6_ins_rt(nrt))
2197 netevent.old = &rt->dst;
2198 netevent.new = &nrt->dst;
2199 netevent.daddr = &msg->dest;
2200 netevent.neigh = neigh;
2201 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* the superseded cached route is removed from the tree */
2203 if (rt->rt6i_flags & RTF_CACHE) {
2204 rt = (struct rt6_info *) dst_clone(&rt->dst);
2209 neigh_release(neigh);
2213 * Misc support functions
/* Link @rt to its parent route @from: take a reference on from->dst and
 * share its metrics (read-only).  @from must not itself have a parent.
 */
2216 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2218 BUG_ON(from->dst.from);
2220 rt->rt6i_flags &= ~RTF_EXPIRES;
2221 dst_hold(&from->dst);
2222 rt->dst.from = &from->dst;
2223 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Initialize a freshly allocated route @rt as a copy of @ort: handlers,
 * addresses, flags, idev/lwtstate references, and the from-link for shared
 * metrics (rt6_set_from).
 */
2226 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2228 rt->dst.input = ort->dst.input;
2229 rt->dst.output = ort->dst.output;
2230 rt->rt6i_dst = ort->rt6i_dst;
2231 rt->dst.error = ort->dst.error;
2232 rt->rt6i_idev = ort->rt6i_idev;
2234 in6_dev_hold(rt->rt6i_idev);
2235 rt->dst.lastuse = jiffies;
2236 rt->rt6i_gateway = ort->rt6i_gateway;
2237 rt->rt6i_flags = ort->rt6i_flags;
2238 rt6_set_from(rt, ort);
2239 rt->rt6i_metric = ort->rt6i_metric;
2240 #ifdef CONFIG_IPV6_SUBTREES
2241 rt->rt6i_src = ort->rt6i_src;
2243 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2244 rt->rt6i_table = ort->rt6i_table;
2245 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2248 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RA Route Information route (RTF_ROUTEINFO|RTF_GATEWAY)
 * for the given prefix, learned via @gwaddr on @ifindex, in RT6_TABLE_INFO.
 */
2249 static struct rt6_info *rt6_get_route_info(struct net *net,
2250 const struct in6_addr *prefix, int prefixlen,
2251 const struct in6_addr *gwaddr, int ifindex)
2253 struct fib6_node *fn;
2254 struct rt6_info *rt = NULL;
2255 struct fib6_table *table;
2257 table = fib6_get_table(net, RT6_TABLE_INFO);
2261 read_lock_bh(&table->tb6_lock);
2262 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2266 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2267 if (rt->dst.dev->ifindex != ifindex)
2269 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2271 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2277 read_unlock_bh(&table->tb6_lock);
/* Install a route learned from an RA Route Information option into
 * RT6_TABLE_INFO and return the freshly looked-up entry (NULL if the add
 * failed).
 */
2281 static struct rt6_info *rt6_add_route_info(struct net *net,
2282 const struct in6_addr *prefix, int prefixlen,
2283 const struct in6_addr *gwaddr, int ifindex,
2286 struct fib6_config cfg = {
2287 .fc_table = RT6_TABLE_INFO,
2288 .fc_metric = IP6_RT_PRIO_USER,
2289 .fc_ifindex = ifindex,
2290 .fc_dst_len = prefixlen,
2291 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2292 RTF_UP | RTF_PREF(pref),
2293 .fc_nlinfo.portid = 0,
2294 .fc_nlinfo.nlh = NULL,
2295 .fc_nlinfo.nl_net = net,
2298 cfg.fc_dst = *prefix;
2299 cfg.fc_gateway = *gwaddr;
2301 /* We should treat it as a default route if prefix length is 0. */
2303 cfg.fc_flags |= RTF_DEFAULT;
/* errors are ignored here; the lookup below reports success/failure */
2305 ip6_route_add(&cfg);
2307 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/* Find the RA-learned default route (RTF_ADDRCONF|RTF_DEFAULT) via gateway
 * @addr on @dev in RT6_TABLE_DFLT, or NULL.
 */
2311 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2313 struct rt6_info *rt;
2314 struct fib6_table *table;
2316 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2320 read_lock_bh(&table->tb6_lock);
2321 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2322 if (dev == rt->dst.dev &&
2323 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2324 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2329 read_unlock_bh(&table->tb6_lock);
/* Install an RA-learned default router (::/0 via @gwaddr on @dev, with
 * router preference @pref) into RT6_TABLE_DFLT and return the looked-up
 * entry.
 */
2333 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2334 struct net_device *dev,
2337 struct fib6_config cfg = {
2338 .fc_table = RT6_TABLE_DFLT,
2339 .fc_metric = IP6_RT_PRIO_USER,
2340 .fc_ifindex = dev->ifindex,
2341 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2342 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2343 .fc_nlinfo.portid = 0,
2344 .fc_nlinfo.nlh = NULL,
2345 .fc_nlinfo.nl_net = dev_net(dev),
2348 cfg.fc_gateway = *gwaddr;
/* errors are ignored; the lookup below reports whether it took effect */
2350 ip6_route_add(&cfg);
2352 return rt6_get_dflt_router(gwaddr, dev);
/* Remove all RA-learned default routes from RT6_TABLE_DFLT, except on
 * interfaces configured with accept_ra == 2 (accept RAs even when
 * forwarding).  The read lock is dropped around each deletion, so the scan
 * restarts afterwards.
 */
2355 void rt6_purge_dflt_routers(struct net *net)
2357 struct rt6_info *rt;
2358 struct fib6_table *table;
2360 /* NOTE: Keep consistent with rt6_get_dflt_router */
2361 table = fib6_get_table(net, RT6_TABLE_DFLT);
2366 read_lock_bh(&table->tb6_lock);
2367 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2368 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2369 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2371 read_unlock_bh(&table->tb6_lock);
2376 read_unlock_bh(&table->tb6_lock);
/* Translate a legacy ioctl struct in6_rtmsg into a fib6_config targeting the
 * main table.
 */
2379 static void rtmsg_to_fib6_config(struct net *net,
2380 struct in6_rtmsg *rtmsg,
2381 struct fib6_config *cfg)
2383 memset(cfg, 0, sizeof(*cfg));
2385 cfg->fc_table = RT6_TABLE_MAIN;
2386 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2387 cfg->fc_metric = rtmsg->rtmsg_metric;
2388 cfg->fc_expires = rtmsg->rtmsg_info;
2389 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2390 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2391 cfg->fc_flags = rtmsg->rtmsg_flags;
2393 cfg->fc_nlinfo.nl_net = net;
2395 cfg->fc_dst = rtmsg->rtmsg_dst;
2396 cfg->fc_src = rtmsg->rtmsg_src;
2397 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Legacy SIOCADDRT/SIOCDELRT ioctl handler: requires CAP_NET_ADMIN, copies
 * the in6_rtmsg from userspace, converts it and adds or deletes the route.
 */
2400 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2402 struct fib6_config cfg;
2403 struct in6_rtmsg rtmsg;
2407 case SIOCADDRT: /* Add a route */
2408 case SIOCDELRT: /* Delete a route */
2409 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2411 err = copy_from_user(&rtmsg, arg,
2412 sizeof(struct in6_rtmsg));
2416 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2421 err = ip6_route_add(&cfg);
2424 err = ip6_route_del(&cfg);
2438 * Drop the packet on the floor
2441 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2444 struct dst_entry *dst = skb_dst(skb);
2445 switch (ipstats_mib_noroutes) {
2446 case IPSTATS_MIB_INNOROUTES:
2447 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
/* unspecified destination counts as an address error, not no-route */
2448 if (type == IPV6_ADDR_ANY) {
2449 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2450 IPSTATS_MIB_INADDRERRORS);
/* fallthrough to the generic no-route accounting */
2454 case IPSTATS_MIB_OUTNOROUTES:
2455 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2456 ipstats_mib_noroutes);
/* notify the sender with an ICMPv6 destination-unreachable of @code */
2459 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* Input handler for unreachable routes: drop and count as in-no-route. */
2464 static int ip6_pkt_discard(struct sk_buff *skb)
2466 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* Output handler for unreachable routes: drop and count as out-no-route. */
2469 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2471 skb->dev = skb_dst(skb)->dev;
2472 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* Input handler for prohibit routes: drop with admin-prohibited ICMP. */
2475 static int ip6_pkt_prohibit(struct sk_buff *skb)
2477 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* Output handler for prohibit routes: drop with admin-prohibited ICMP. */
2480 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2482 skb->dev = skb_dst(skb)->dev;
2483 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2487 * Allocate a dst for local (unicast / anycast) address.
/* Build a host route delivering a local (unicast or anycast) address via the
 * loopback device.  The route goes to RT6_TABLE_LOCAL and is marked
 * DST_NOCACHE (not inserted into the FIB tree by the allocator).
 */
2490 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2491 const struct in6_addr *addr,
2494 struct net *net = dev_net(idev->dev);
2495 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2498 return ERR_PTR(-ENOMEM);
2502 rt->dst.flags |= DST_HOST;
2503 rt->dst.input = ip6_input;
2504 rt->dst.output = ip6_output;
2505 rt->rt6i_idev = idev;
2507 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2509 rt->rt6i_flags |= RTF_ANYCAST;
2511 rt->rt6i_flags |= RTF_LOCAL;
2513 rt->rt6i_gateway = *addr;
2514 rt->rt6i_dst.addr = *addr;
2515 rt->rt6i_dst.plen = 128;
2516 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2517 rt->dst.flags |= DST_NOCACHE;
2519 atomic_set(&rt->dst.__refcnt, 1);
/* Choose a source address for @daddr: the route's preferred source
 * (rt6i_prefsrc) wins when set, otherwise fall back to the standard
 * source-address selection on the route's device.
 */
2524 int ip6_route_get_saddr(struct net *net,
2525 struct rt6_info *rt,
2526 const struct in6_addr *daddr,
2528 struct in6_addr *saddr)
2530 struct inet6_dev *idev =
2531 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2533 if (rt && rt->rt6i_prefsrc.plen)
2534 *saddr = rt->rt6i_prefsrc.addr;
2536 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2537 daddr, prefs, saddr);
2541 /* remove deleted ip from prefsrc entries */
/* walker argument for fib6_remove_prefsrc: device + address being removed */
2542 struct arg_dev_net_ip {
2543 struct net_device *dev;
2545 struct in6_addr *addr;
/* fib6_clean_all callback: clear the preferred-source setting of routes that
 * reference the address being deleted (a NULL device matches all devices).
 */
2548 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2550 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2551 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2552 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2554 if (((void *)rt->dst.dev == dev || !dev) &&
2555 rt != net->ipv6.ip6_null_entry &&
2556 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2557 /* remove prefsrc entry */
2558 rt->rt6i_prefsrc.plen = 0;
/* Purge @ifp's address from all routes' preferred-source fields when the
 * address is deleted.
 */
2563 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2565 struct net *net = dev_net(ifp->idev->dev);
2566 struct arg_dev_net_ip adni = {
2567 .dev = ifp->idev->dev,
2571 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
/* flag combinations identifying RA-learned router routes and cached
 * gateway routes, used by fib6_clean_tohost below
 */
2574 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2575 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2577 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all callback: match RA router routes and cached gateway routes
 * whose gateway equals the address that just stopped being a router.
 */
2578 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2580 struct in6_addr *gateway = (struct in6_addr *)arg;
2582 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2583 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2584 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
/* Sweep all tables for routes via @gateway after it became a plain host. */
2590 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2592 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* walker argument for fib6_ifdown: the device going down (NULL = all) */
2595 struct arg_dev_net {
2596 struct net_device *dev;
/* Cleanup callback: match routes on the departing device (or every route
 * when dev is NULL), excluding the shared null entry.
 */
2600 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2602 const struct arg_dev_net *adn = arg;
2603 const struct net_device *dev = adn->dev;
2605 if ((rt->dst.dev == dev || !dev) &&
2606 rt != adn->net->ipv6.ip6_null_entry)
/* Device-down handling: purge @dev's routes from the FIB, the ICMPv6 dst
 * list and the uncached route list.
 */
2612 void rt6_ifdown(struct net *net, struct net_device *dev)
2614 struct arg_dev_net adn = {
2619 fib6_clean_all(net, fib6_ifdown, &adn);
2620 icmp6_clean_all(fib6_ifdown, &adn);
2622 rt6_uncached_list_flush_dev(net, dev);
/* walker argument for rt6_mtu_change_route: device and its new MTU */
2625 struct rt6_mtu_change_arg {
2626 struct net_device *dev;
/* fib6_clean_all callback applying a device MTU change to each route on that
 * device: cached routes get their learned PMTU clamped down; other routes
 * have their RTAX_MTU metric updated per the RFC 1981 rationale in the
 * comments below.  Locked MTU metrics are left alone.
 */
2630 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2632 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2633 struct inet6_dev *idev;
2635 /* In IPv6 pmtu discovery is not optional,
2636 so that RTAX_MTU lock cannot disable it.
2637 We still use this lock to block changes
2638 caused by addrconf/ndisc.
2641 idev = __in6_dev_get(arg->dev);
2645 /* For administrative MTU increase, there is no way to discover
2646 IPv6 PMTU increase, so PMTU increase should be updated here.
2647 Since RFC 1981 doesn't include administrative MTU increase
2648 update PMTU increase is a MUST. (i.e. jumbo frame)
2651 If new MTU is less than route PMTU, this new MTU will be the
2652 lowest MTU in the path, update the route PMTU to reflect PMTU
2653 decreases; if new MTU is greater than route PMTU, and the
2654 old MTU is the lowest MTU in the path, update the route PMTU
2655 to reflect the increase. In this case if the other nodes' MTU
2656 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2659 if (rt->dst.dev == arg->dev &&
2660 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2661 if (rt->rt6i_flags & RTF_CACHE) {
2662 /* For RTF_CACHE with rt6i_pmtu == 0
2663 * (i.e. a redirected route),
2664 * the metrics of its rt->dst.from has already
2667 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2668 rt->rt6i_pmtu = arg->mtu;
2669 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2670 (dst_mtu(&rt->dst) < arg->mtu &&
2671 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2672 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Propagate a device MTU change to every affected route. */
2678 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2680 struct rt6_mtu_change_arg arg = {
2685 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* netlink attribute validation policy for RTM_NEWROUTE/RTM_DELROUTE */
2688 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2689 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2690 [RTA_OIF] = { .type = NLA_U32 },
2691 [RTA_IIF] = { .type = NLA_U32 },
2692 [RTA_PRIORITY] = { .type = NLA_U32 },
2693 [RTA_METRICS] = { .type = NLA_NESTED },
2694 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2695 [RTA_PREF] = { .type = NLA_U8 },
2696 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2697 [RTA_ENCAP] = { .type = NLA_NESTED },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config:
 * validate attributes against rtm_ipv6_policy, map the rtm type/flags to
 * RTF_* flags and copy each optional attribute into the config.
 */
2700 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2701 struct fib6_config *cfg)
2704 struct nlattr *tb[RTA_MAX+1];
2708 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2713 rtm = nlmsg_data(nlh);
2714 memset(cfg, 0, sizeof(*cfg));
2716 cfg->fc_table = rtm->rtm_table;
2717 cfg->fc_dst_len = rtm->rtm_dst_len;
2718 cfg->fc_src_len = rtm->rtm_src_len;
2719 cfg->fc_flags = RTF_UP;
2720 cfg->fc_protocol = rtm->rtm_protocol;
2721 cfg->fc_type = rtm->rtm_type;
/* all reject-style route types carry RTF_REJECT */
2723 if (rtm->rtm_type == RTN_UNREACHABLE ||
2724 rtm->rtm_type == RTN_BLACKHOLE ||
2725 rtm->rtm_type == RTN_PROHIBIT ||
2726 rtm->rtm_type == RTN_THROW)
2727 cfg->fc_flags |= RTF_REJECT;
2729 if (rtm->rtm_type == RTN_LOCAL)
2730 cfg->fc_flags |= RTF_LOCAL;
2732 if (rtm->rtm_flags & RTM_F_CLONED)
2733 cfg->fc_flags |= RTF_CACHE;
2735 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2736 cfg->fc_nlinfo.nlh = nlh;
2737 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2739 if (tb[RTA_GATEWAY]) {
2740 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2741 cfg->fc_flags |= RTF_GATEWAY;
/* the attribute must hold at least the prefix-length-many bytes */
2745 int plen = (rtm->rtm_dst_len + 7) >> 3;
2747 if (nla_len(tb[RTA_DST]) < plen)
2750 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2754 int plen = (rtm->rtm_src_len + 7) >> 3;
2756 if (nla_len(tb[RTA_SRC]) < plen)
2759 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2762 if (tb[RTA_PREFSRC])
2763 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2766 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2768 if (tb[RTA_PRIORITY])
2769 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2771 if (tb[RTA_METRICS]) {
2772 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2773 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2777 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2779 if (tb[RTA_MULTIPATH]) {
2780 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2781 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
/* unknown router-preference values fall back to MEDIUM */
2785 pref = nla_get_u8(tb[RTA_PREF]);
2786 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2787 pref != ICMPV6_ROUTER_PREF_HIGH)
2788 pref = ICMPV6_ROUTER_PREF_MEDIUM;
2789 cfg->fc_flags |= RTF_PREF(pref);
2793 cfg->fc_encap = tb[RTA_ENCAP];
2795 if (tb[RTA_ENCAP_TYPE])
2796 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
/* one pending nexthop while building a multipath route: the allocated
 * rt6_info, its originating config, its metrics and the list linkage
 */
2804 struct rt6_info *rt6_info;
2805 struct fib6_config r_cfg;
2806 struct mx6_config mxc;
2807 struct list_head next;
/* Log every nexthop of a failed multipath replace so the admin can audit
 * which routes may be in an inconsistent state.
 */
2810 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2814 list_for_each_entry(nh, rt6_nh_list, next) {
2815 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2816 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2817 nh->r_cfg.fc_ifindex);
/* Append (rt, r_cfg) to the multipath build list unless an equivalent
 * nexthop (same device, idev and gateway) is already queued; converts the
 * config's metrics into the new node.
 */
2821 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2822 struct rt6_info *rt, struct fib6_config *r_cfg)
2825 struct rt6_info *rtnh;
2828 list_for_each_entry(nh, rt6_nh_list, next) {
2829 /* check if rt6_info already exists */
2830 rtnh = nh->rt6_info;
2832 if (rtnh->dst.dev == rt->dst.dev &&
2833 rtnh->rt6i_idev == rt->rt6i_idev &&
2834 ipv6_addr_equal(&rtnh->rt6i_gateway,
2839 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2843 err = ip6_convert_metrics(&nh->mxc, r_cfg);
2848 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2849 list_add_tail(&nh->next, rt6_nh_list);
/* Add a multipath route: parse every rtnexthop in RTA_MULTIPATH into a
 * per-nexthop config/rt6_info list, then insert them one by one.  On a
 * partial failure, routes already added are deleted again; for a failed
 * NLM_F_REPLACE the possibly-inconsistent state is logged instead.
 */
2854 static int ip6_route_multipath_add(struct fib6_config *cfg)
2856 struct fib6_config r_cfg;
2857 struct rtnexthop *rtnh;
2858 struct rt6_info *rt;
2859 struct rt6_nh *err_nh;
2860 struct rt6_nh *nh, *nh_safe;
2865 int replace = (cfg->fc_nlinfo.nlh &&
2866 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2867 LIST_HEAD(rt6_nh_list);
2869 remaining = cfg->fc_mp_len;
2870 rtnh = (struct rtnexthop *)cfg->fc_mp;
2872 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
2873 * rt6_info structs per nexthop
2875 while (rtnh_ok(rtnh, remaining)) {
/* per-nexthop config starts as a copy of the base config */
2876 memcpy(&r_cfg, cfg, sizeof(*cfg));
2877 if (rtnh->rtnh_ifindex)
2878 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2880 attrlen = rtnh_attrlen(rtnh);
2882 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2884 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2886 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2887 r_cfg.fc_flags |= RTF_GATEWAY;
2889 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2890 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2892 r_cfg.fc_encap_type = nla_get_u16(nla);
2895 err = ip6_route_info_create(&r_cfg, &rt);
2899 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
2905 rtnh = rtnh_next(rtnh, &remaining);
2909 list_for_each_entry(nh, &rt6_nh_list, next) {
2910 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
2911 /* nh->rt6_info is used or freed at this point, reset to NULL*/
2912 nh->rt6_info = NULL;
2915 ip6_print_replace_route_err(&rt6_nh_list);
2920 /* Because each route is added like a single route we remove
2921 * these flags after the first nexthop: if there is a collision,
2922 * we have already failed to add the first nexthop:
2923 * fib6_add_rt2node() has rejected it; when replacing, old
2924 * nexthops have been replaced by first new, the rest should
2927 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2935 /* Delete routes that were already added */
2936 list_for_each_entry(nh, &rt6_nh_list, next) {
2939 ip6_route_del(&nh->r_cfg);
/* free whatever is left on the build list */
2943 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
2945 dst_free(&nh->rt6_info->dst);
2947 list_del(&nh->next);
/* Delete each nexthop of a multipath route independently, remembering the
 * last error so a partial failure is still reported.
 */
2954 static int ip6_route_multipath_del(struct fib6_config *cfg)
2956 struct fib6_config r_cfg;
2957 struct rtnexthop *rtnh;
2960 int err = 1, last_err = 0;
2962 remaining = cfg->fc_mp_len;
2963 rtnh = (struct rtnexthop *)cfg->fc_mp;
2965 /* Parse a Multipath Entry */
2966 while (rtnh_ok(rtnh, remaining)) {
2967 memcpy(&r_cfg, cfg, sizeof(*cfg));
2968 if (rtnh->rtnh_ifindex)
2969 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2971 attrlen = rtnh_attrlen(rtnh);
2973 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2975 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2977 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
2978 r_cfg.fc_flags |= RTF_GATEWAY;
2981 err = ip6_route_del(&r_cfg);
2985 rtnh = rtnh_next(rtnh, &remaining);
/*
 * inet6_rtm_delroute - netlink RTM_DELROUTE handler.  Translates the
 * netlink message into a fib6_config and dispatches to the multipath or
 * single-route delete path.
 *
 * NOTE(review): the guard selecting between the two returns (presumably
 * `if (cfg.fc_mp)`) and the error check after rtm_to_fib6_config() are
 * among the lines elided from this extract - confirm against the full file.
 */
2991 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2993 struct fib6_config cfg;
2996 err = rtm_to_fib6_config(skb, nlh, &cfg);
3001 return ip6_route_multipath_del(&cfg);
3003 return ip6_route_del(&cfg);
/*
 * inet6_rtm_newroute - netlink RTM_NEWROUTE handler.  Mirrors
 * inet6_rtm_delroute(): build a fib6_config from the message, then add
 * either a multipath set or a single route.
 *
 * NOTE(review): the `if (cfg.fc_mp)` style guard and the error check on
 * rtm_to_fib6_config() are elided in this extract - confirm.
 */
3006 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3008 struct fib6_config cfg;
3011 err = rtm_to_fib6_config(skb, nlh, &cfg);
3016 return ip6_route_multipath_add(&cfg);
3018 return ip6_route_add(&cfg);
/*
 * rt6_nlmsg_size - worst-case skb payload needed for one RTM_*ROUTE
 * notification about @rt.  Each nla_total_size() term reserves room for
 * one attribute that rt6_fill_node() may emit; must stay in sync with it
 * (a too-small estimate shows up as -EMSGSIZE in inet6_rt_notify()).
 */
3021 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3023 return NLMSG_ALIGN(sizeof(struct rtmsg))
3024 + nla_total_size(16) /* RTA_SRC */
3025 + nla_total_size(16) /* RTA_DST */
3026 + nla_total_size(16) /* RTA_GATEWAY */
3027 + nla_total_size(16) /* RTA_PREFSRC */
3028 + nla_total_size(4) /* RTA_TABLE */
3029 + nla_total_size(4) /* RTA_IIF */
3030 + nla_total_size(4) /* RTA_OIF */
3031 + nla_total_size(4) /* RTA_PRIORITY */
3032 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3033 + nla_total_size(sizeof(struct rta_cacheinfo))
3034 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3035 + nla_total_size(1) /* RTA_PREF */
3036 + lwtunnel_get_encap_size(rt->dst.lwtstate);
/*
 * rt6_fill_node - serialize one rt6_info into a netlink rtmsg + attributes
 * in @skb.  Used by route dumps, RTM_GETROUTE replies and notifications.
 * On attribute-space exhaustion it jumps to nla_put_failure, cancels the
 * partially-built message and (in the full file) returns -EMSGSIZE.
 *
 * @prefix: if non-zero, only RTF_PREFIX_RT routes are emitted (others are
 *          skipped as "success").
 * @nowait: forwarded to ip6mr_get_route() for multicast destinations.
 *
 * NOTE(review): many interior lines are missing from this extract
 * (closing braces, `case`/`default` labels of the RTF_REJECT switch,
 * several `if`/`else` guards, `#endif`s, and the return statements).
 * The visible lines are kept byte-identical below.
 */
3039 static int rt6_fill_node(struct net *net,
3040 struct sk_buff *skb, struct rt6_info *rt,
3041 struct in6_addr *dst, struct in6_addr *src,
3042 int iif, int type, u32 portid, u32 seq,
3043 int prefix, int nowait, unsigned int flags)
3045 u32 metrics[RTAX_MAX];
3047 struct nlmsghdr *nlh;
3051 if (prefix) { /* user wants prefix routes only */
3052 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3053 /* success since this is not a prefix route */
3058 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3062 rtm = nlmsg_data(nlh);
3063 rtm->rtm_family = AF_INET6;
3064 rtm->rtm_dst_len = rt->rt6i_dst.plen;
3065 rtm->rtm_src_len = rt->rt6i_src.plen;
3068 table = rt->rt6i_table->tb6_id;
3070 table = RT6_TABLE_UNSPEC;
3071 rtm->rtm_table = table;
3072 if (nla_put_u32(skb, RTA_TABLE, table))
3073 goto nla_put_failure;
/* Reject routes encode their flavor in dst.error */
3074 if (rt->rt6i_flags & RTF_REJECT) {
3075 switch (rt->dst.error) {
3077 rtm->rtm_type = RTN_BLACKHOLE;
3080 rtm->rtm_type = RTN_PROHIBIT;
3083 rtm->rtm_type = RTN_THROW;
3086 rtm->rtm_type = RTN_UNREACHABLE;
3090 else if (rt->rt6i_flags & RTF_LOCAL)
3091 rtm->rtm_type = RTN_LOCAL;
3092 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3093 rtm->rtm_type = RTN_LOCAL;
3095 rtm->rtm_type = RTN_UNICAST;
/* Report link-down state; optionally mark dead per idev sysctl */
3097 if (!netif_carrier_ok(rt->dst.dev)) {
3098 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3099 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3100 rtm->rtm_flags |= RTNH_F_DEAD;
3102 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3103 rtm->rtm_protocol = rt->rt6i_protocol;
3104 if (rt->rt6i_flags & RTF_DYNAMIC)
3105 rtm->rtm_protocol = RTPROT_REDIRECT;
3106 else if (rt->rt6i_flags & RTF_ADDRCONF) {
3107 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3108 rtm->rtm_protocol = RTPROT_RA;
3110 rtm->rtm_protocol = RTPROT_KERNEL;
3113 if (rt->rt6i_flags & RTF_CACHE)
3114 rtm->rtm_flags |= RTM_F_CLONED;
/* Caller-supplied dst (RTM_GETROUTE) wins over the route's own prefix */
3117 if (nla_put_in6_addr(skb, RTA_DST, dst))
3118 goto nla_put_failure;
3119 rtm->rtm_dst_len = 128;
3120 } else if (rtm->rtm_dst_len)
3121 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3122 goto nla_put_failure;
3123 #ifdef CONFIG_IPV6_SUBTREES
3125 if (nla_put_in6_addr(skb, RTA_SRC, src))
3126 goto nla_put_failure;
3127 rtm->rtm_src_len = 128;
3128 } else if (rtm->rtm_src_len &&
3129 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3130 goto nla_put_failure;
3133 #ifdef CONFIG_IPV6_MROUTE
3134 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3135 int err = ip6mr_get_route(net, skb, rtm, nowait);
3140 goto nla_put_failure;
3142 if (err == -EMSGSIZE)
3143 goto nla_put_failure;
3148 if (nla_put_u32(skb, RTA_IIF, iif))
3149 goto nla_put_failure;
3151 struct in6_addr saddr_buf;
3152 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3153 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3154 goto nla_put_failure;
3157 if (rt->rt6i_prefsrc.plen) {
3158 struct in6_addr saddr_buf;
3159 saddr_buf = rt->rt6i_prefsrc.addr;
3160 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3161 goto nla_put_failure;
/* Cached per-route PMTU overrides the inherited metric */
3164 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3166 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3167 if (rtnetlink_put_metrics(skb, metrics) < 0)
3168 goto nla_put_failure;
3170 if (rt->rt6i_flags & RTF_GATEWAY) {
3171 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3172 goto nla_put_failure;
3176 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3177 goto nla_put_failure;
3178 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3179 goto nla_put_failure;
3181 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3183 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3184 goto nla_put_failure;
3186 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3187 goto nla_put_failure;
3189 lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3191 nlmsg_end(skb, nlh);
/* error path: drop the partially built message */
3195 nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - fib6 walker callback for RTM_GETROUTE dumps; formats
 * one route via rt6_fill_node().  Honors the RTM_F_PREFIX request flag
 * when the dump request actually carried an rtmsg payload.
 */
3199 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3201 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3204 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3205 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3206 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
/* NOTE(review): the `int prefix = 0;` declaration is elided in this extract */
3210 return rt6_fill_node(arg->net,
3211 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3212 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3213 prefix, 0, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - netlink RTM_GETROUTE handler: parse the request
 * attributes into a flowi6, perform an input- or output-side route
 * lookup, and unicast the formatted result back to the requester.
 *
 * NOTE(review): interior lines are elided in this extract (the `if
 * (tb[RTA_*])` guards around each attribute read, the fl6/rtm
 * declarations, error-handling branches and closing braces).
 */
3216 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3218 struct net *net = sock_net(in_skb->sk);
3219 struct nlattr *tb[RTA_MAX+1];
3220 struct rt6_info *rt;
3221 struct sk_buff *skb;
3224 int err, iif = 0, oif = 0;
3226 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3231 memset(&fl6, 0, sizeof(fl6));
/* Reject truncated address attributes before copying 16 bytes */
3234 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3237 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3241 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3244 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3248 iif = nla_get_u32(tb[RTA_IIF]);
3251 oif = nla_get_u32(tb[RTA_OIF]);
3254 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
/* iif set: emulate an input lookup on that device */
3257 struct net_device *dev;
3260 dev = __dev_get_by_index(net, iif);
3266 fl6.flowi6_iif = iif;
3268 if (!ipv6_addr_any(&fl6.saddr))
3269 flags |= RT6_LOOKUP_F_HAS_SADDR;
3271 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
/* otherwise: output lookup keyed by oif */
3274 fl6.flowi6_oif = oif;
3276 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3279 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3286 /* Reserve room for dummy headers, this skb can pass
3287 through good chunk of routing engine.
3289 skb_reset_mac_header(skb);
3290 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* skb now owns the dst reference taken by the lookup */
3292 skb_dst_set(skb, &rt->dst);
3294 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3295 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3296 nlh->nlmsg_seq, 0, 0, 0);
3302 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/*
 * inet6_rt_notify - broadcast an RTM_NEWROUTE/RTM_DELROUTE event for @rt
 * to RTNLGRP_IPV6_ROUTE listeners.  Allocates the skb with the size
 * estimated by rt6_nlmsg_size(); if rt6_fill_node() still reports
 * -EMSGSIZE the estimate is buggy (hence the WARN_ON).
 */
3307 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3308 unsigned int nlm_flags)
3310 struct sk_buff *skb;
3311 struct net *net = info->nl_net;
3316 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* gfp_any(): GFP_ATOMIC in atomic context, GFP_KERNEL otherwise */
3318 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3322 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3323 event, info->portid, seq, 0, 0, nlm_flags);
3325 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3326 WARN_ON(err == -EMSGSIZE);
3330 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3331 info->nlh, gfp_any());
/* error path: record the failure for interested sockets */
3335 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify - netdevice notifier: when the per-netns loopback
 * device registers, attach it (and its inet6_dev) to the special
 * null/prohibit/blackhole route templates of that namespace.
 */
3338 static int ip6_route_dev_notify(struct notifier_block *this,
3339 unsigned long event, void *ptr)
3341 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3342 struct net *net = dev_net(dev);
3344 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3345 net->ipv6.ip6_null_entry->dst.dev = dev;
/* each in6_dev_get() takes a reference on the loopback idev */
3346 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3347 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3348 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3349 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3350 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3351 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3362 #ifdef CONFIG_PROC_FS
/* file_operations for /proc/net/ipv6_route (seq_file based).
 * NOTE(review): the .read hook line is elided in this extract. */
3364 static const struct file_operations ipv6_route_proc_fops = {
3365 .owner = THIS_MODULE,
3366 .open = ipv6_route_open,
3368 .llseek = seq_lseek,
3369 .release = seq_release_net,
/*
 * rt6_stats_seq_show - emit /proc/net/rt6_stats: seven hex counters
 * (fib nodes, route nodes, rt allocs, rt entries, rt cache, dst entries,
 * discarded routes) for the owning namespace.
 */
3372 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3374 struct net *net = (struct net *)seq->private;
3375 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3376 net->ipv6.rt6_stats->fib_nodes,
3377 net->ipv6.rt6_stats->fib_route_nodes,
3378 net->ipv6.rt6_stats->fib_rt_alloc,
3379 net->ipv6.rt6_stats->fib_rt_entries,
3380 net->ipv6.rt6_stats->fib_rt_cache,
3381 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3382 net->ipv6.rt6_stats->fib_discarded_routes,
/* open hook for /proc/net/rt6_stats: single-shot, netns-aware seq file */
3387 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3389 return single_open_net(inode, file, rt6_stats_seq_show);
/* file_operations for /proc/net/rt6_stats.
 * NOTE(review): the .read hook line is elided in this extract. */
3392 static const struct file_operations rt6_stats_seq_fops = {
3393 .owner = THIS_MODULE,
3394 .open = rt6_stats_seq_open,
3396 .llseek = seq_lseek,
3397 .release = single_release_net,
3399 #endif /* CONFIG_PROC_FS */
3401 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush: writing
 * triggers a fib6 garbage-collection run, using the previously stored
 * flush_delay (read before proc_dointvec overwrites it) as the delay.
 * delay <= 0 means flush immediately.
 */
3404 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3405 void __user *buffer, size_t *lenp, loff_t *ppos)
3412 net = (struct net *)ctl->extra1;
3413 delay = net->ipv6.sysctl.flush_delay;
3414 proc_dointvec(ctl, write, buffer, lenp, ppos);
3415 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/*
 * Template for the per-namespace net.ipv6.route sysctl table; cloned and
 * re-pointed at each netns's own fields by ipv6_route_sysctl_init().
 * Entry ORDER is part of the contract: ipv6_route_sysctl_init() patches
 * .data by numeric index (table[0]..table[9]).
 * NOTE(review): the .mode fields and entry-separating braces are elided
 * in this extract.
 */
3419 struct ctl_table ipv6_route_table_template[] = {
3421 .procname = "flush",
3422 .data = &init_net.ipv6.sysctl.flush_delay,
3423 .maxlen = sizeof(int),
3425 .proc_handler = ipv6_sysctl_rtcache_flush
3428 .procname = "gc_thresh",
3429 .data = &ip6_dst_ops_template.gc_thresh,
3430 .maxlen = sizeof(int),
3432 .proc_handler = proc_dointvec,
3435 .procname = "max_size",
3436 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3437 .maxlen = sizeof(int),
3439 .proc_handler = proc_dointvec,
3442 .procname = "gc_min_interval",
3443 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3444 .maxlen = sizeof(int),
3446 .proc_handler = proc_dointvec_jiffies,
3449 .procname = "gc_timeout",
3450 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3451 .maxlen = sizeof(int),
3453 .proc_handler = proc_dointvec_jiffies,
3456 .procname = "gc_interval",
3457 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3458 .maxlen = sizeof(int),
3460 .proc_handler = proc_dointvec_jiffies,
3463 .procname = "gc_elasticity",
3464 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3465 .maxlen = sizeof(int),
3467 .proc_handler = proc_dointvec,
3470 .procname = "mtu_expires",
3471 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3472 .maxlen = sizeof(int),
3474 .proc_handler = proc_dointvec_jiffies,
3477 .procname = "min_adv_mss",
3478 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3479 .maxlen = sizeof(int),
3481 .proc_handler = proc_dointvec,
/* same variable as gc_min_interval, exposed in milliseconds */
3484 .procname = "gc_min_interval_ms",
3485 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3486 .maxlen = sizeof(int),
3488 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init - clone ipv6_route_table_template for @net and
 * retarget each entry's .data from init_net to this namespace's fields.
 * Indices must match the template's entry order exactly.  Returns the
 * kmemdup'd table (caller owns/frees it); NULL-return path on allocation
 * failure is among the elided lines.
 */
3493 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3495 struct ctl_table *table;
3497 table = kmemdup(ipv6_route_table_template,
3498 sizeof(ipv6_route_table_template),
3502 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 carries the netns for ipv6_sysctl_rtcache_flush() */
3503 table[0].extra1 = net;
3504 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3505 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3506 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3507 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3508 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3509 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3510 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3511 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3512 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3514 /* Don't export sysctls to unprivileged users */
3515 if (net->user_ns != &init_user_ns)
3516 table[0].procname = NULL;
/*
 * ip6_route_net_init - per-namespace setup: clone dst_ops from the
 * template, allocate the null (and, with multiple tables, prohibit and
 * blackhole) route singletons, and seed the routing sysctls/GC defaults.
 * Unwinds with the classic goto-cleanup ladder on failure.
 *
 * NOTE(review): several lines are elided in this extract (the GFP flags
 * of the kmemdup calls, `return 0`/`return ret`, the out_ip6_null_entry
 * label and some `#endif`s) - see the full file.
 */
3523 static int __net_init ip6_route_net_init(struct net *net)
3527 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3528 sizeof(net->ipv6.ip6_dst_ops));
3530 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3531 goto out_ip6_dst_ops;
3533 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3534 sizeof(*net->ipv6.ip6_null_entry),
3536 if (!net->ipv6.ip6_null_entry)
3537 goto out_ip6_dst_entries;
/* each singleton's dst.path points back at itself */
3538 net->ipv6.ip6_null_entry->dst.path =
3539 (struct dst_entry *)net->ipv6.ip6_null_entry;
3540 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3541 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3542 ip6_template_metrics, true);
3544 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3545 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3546 sizeof(*net->ipv6.ip6_prohibit_entry),
3548 if (!net->ipv6.ip6_prohibit_entry)
3549 goto out_ip6_null_entry;
3550 net->ipv6.ip6_prohibit_entry->dst.path =
3551 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3552 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3553 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3554 ip6_template_metrics, true);
3556 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3557 sizeof(*net->ipv6.ip6_blk_hole_entry),
3559 if (!net->ipv6.ip6_blk_hole_entry)
3560 goto out_ip6_prohibit_entry;
3561 net->ipv6.ip6_blk_hole_entry->dst.path =
3562 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3563 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3564 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3565 ip6_template_metrics, true);
/* default sysctl/GC tunables for a fresh namespace */
3568 net->ipv6.sysctl.flush_delay = 0;
3569 net->ipv6.sysctl.ip6_rt_max_size = 4096;
3570 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3571 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3572 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3573 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3574 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* min advmss: IPV6_MIN_MTU minus TCP (20) and IPv6 (40) headers */
3575 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3577 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* error unwinding: free in reverse order of allocation */
3583 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3584 out_ip6_prohibit_entry:
3585 kfree(net->ipv6.ip6_prohibit_entry);
3587 kfree(net->ipv6.ip6_null_entry);
3589 out_ip6_dst_entries:
3590 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-namespace teardown: free the special route singletons allocated by
 * ip6_route_net_init() and release the dst entry accounting. */
3595 static void __net_exit ip6_route_net_exit(struct net *net)
3597 kfree(net->ipv6.ip6_null_entry);
3598 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3599 kfree(net->ipv6.ip6_prohibit_entry);
3600 kfree(net->ipv6.ip6_blk_hole_entry);
3602 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-namespace init: create the /proc/net route entries once the
 * rest of the namespace's routing state exists. */
3605 static int __net_init ip6_route_net_init_late(struct net *net)
3607 #ifdef CONFIG_PROC_FS
3608 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3609 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-namespace teardown: remove the /proc/net entries created by
 * ip6_route_net_init_late(). */
3614 static void __net_exit ip6_route_net_exit_late(struct net *net)
3616 #ifdef CONFIG_PROC_FS
3617 remove_proc_entry("ipv6_route", net->proc_net);
3618 remove_proc_entry("rt6_stats", net->proc_net);
/* pernet ops for the core routing state (registered early in ip6_route_init) */
3622 static struct pernet_operations ip6_route_net_ops = {
3623 .init = ip6_route_net_init,
3624 .exit = ip6_route_net_exit,
/* Allocate and install the per-namespace IPv6 inetpeer base.
 * NOTE(review): the NULL-check/return lines are elided in this extract. */
3627 static int __net_init ipv6_inetpeer_init(struct net *net)
3629 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3633 inet_peer_base_init(bp);
3634 net->ipv6.peers = bp;
/* Detach and invalidate the namespace's inetpeer tree; clearing the
 * pointer before invalidation prevents further lookups through it. */
3638 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3640 struct inet_peer_base *bp = net->ipv6.peers;
3642 net->ipv6.peers = NULL;
3643 inetpeer_invalidate_tree(bp);
/* pernet ops for the IPv6 inetpeer base */
3647 static struct pernet_operations ipv6_inetpeer_ops = {
3648 .init = ipv6_inetpeer_init,
3649 .exit = ipv6_inetpeer_exit,
/* pernet ops for the late (proc-entry) init/exit stage */
3652 static struct pernet_operations ip6_route_net_late_ops = {
3653 .init = ip6_route_net_init_late,
3654 .exit = ip6_route_net_exit_late,
/* netdevice notifier hooking ip6_route_dev_notify() (loopback wiring) */
3657 static struct notifier_block ip6_route_dev_notifier = {
3658 .notifier_call = ip6_route_dev_notify,
/*
 * ip6_route_init - module init: create the rt6_info slab, register the
 * pernet subsystems (inetpeer, core routing, late proc stage), wire the
 * init_net special routes to loopback, register the RTM_* rtnetlink
 * handlers and the netdevice notifier, and init the per-cpu uncached
 * route lists.  Failures unwind through the goto ladder at the bottom.
 *
 * NOTE(review): several lines are elided in this extract (variable
 * declarations for ret/cpu, the `if (ret)` guards before most gotos,
 * fib6_init()/xfrm6_init() calls visible only via their error labels,
 * and the success `return ret`).
 */
3662 int __init ip6_route_init(void)
3668 ip6_dst_ops_template.kmem_cachep =
3669 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3670 SLAB_HWCACHE_ALIGN, NULL);
3671 if (!ip6_dst_ops_template.kmem_cachep)
3674 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3676 goto out_kmem_cache;
3678 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3680 goto out_dst_entries;
3682 ret = register_pernet_subsys(&ip6_route_net_ops);
3684 goto out_register_inetpeer;
/* blackhole dsts share the same slab as regular rt6_info */
3686 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3688 /* Registering of the loopback is done before this portion of code,
3689 * the loopback reference in rt6_info will not be taken, do it
3690 * manually for init_net */
3691 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3692 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3693 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3694 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3695 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3696 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3697 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3701 goto out_register_subsys;
3707 ret = fib6_rules_init();
3711 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3713 goto fib6_rules_init;
/* netlink route message handlers; any failure tears everything down */
3716 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3717 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3718 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3719 goto out_register_late_subsys;
3721 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3723 goto out_register_late_subsys;
3725 for_each_possible_cpu(cpu) {
3726 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3728 INIT_LIST_HEAD(&ul->head);
3729 spin_lock_init(&ul->lock);
/* error unwinding: undo registrations in reverse order */
3735 out_register_late_subsys:
3736 unregister_pernet_subsys(&ip6_route_net_late_ops);
3738 fib6_rules_cleanup();
3743 out_register_subsys:
3744 unregister_pernet_subsys(&ip6_route_net_ops);
3745 out_register_inetpeer:
3746 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3748 dst_entries_destroy(&ip6_dst_blackhole_ops);
3750 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3754 void ip6_route_cleanup(void)
3756 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3757 unregister_pernet_subsys(&ip6_route_net_late_ops);
3758 fib6_rules_cleanup();
3761 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3762 unregister_pernet_subsys(&ip6_route_net_ops);
3763 dst_entries_destroy(&ip6_dst_blackhole_ops);
3764 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);