]> git.karo-electronics.de Git - karo-tx-linux.git/blob - net/ipv6/route.c
net: ipv6: Don't purge default router if accept_ra=2
[karo-tx-linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
65 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
66                                     const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int      ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void             ip6_dst_destroy(struct dst_entry *);
72 static void             ip6_dst_ifdown(struct dst_entry *,
73                                        struct net_device *dev, int how);
74 static int               ip6_dst_gc(struct dst_ops *ops);
75
76 static int              ip6_pkt_discard(struct sk_buff *skb);
77 static int              ip6_pkt_discard_out(struct sk_buff *skb);
78 static void             ip6_link_failure(struct sk_buff *skb);
79 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
80
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83                                            const struct in6_addr *prefix, int prefixlen,
84                                            const struct in6_addr *gwaddr, int ifindex,
85                                            unsigned pref);
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87                                            const struct in6_addr *prefix, int prefixlen,
88                                            const struct in6_addr *gwaddr, int ifindex);
89 #endif
90
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
92 {
93         struct rt6_info *rt = (struct rt6_info *) dst;
94         struct inet_peer *peer;
95         u32 *p = NULL;
96
97         if (!(rt->dst.flags & DST_HOST))
98                 return NULL;
99
100         if (!rt->rt6i_peer)
101                 rt6_bind_peer(rt, 1);
102
103         peer = rt->rt6i_peer;
104         if (peer) {
105                 u32 *old_p = __DST_METRICS_PTR(old);
106                 unsigned long prev, new;
107
108                 p = peer->metrics;
109                 if (inet_metrics_new(peer))
110                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111
112                 new = (unsigned long) p;
113                 prev = cmpxchg(&dst->_metrics, old, new);
114
115                 if (prev != old) {
116                         p = __DST_METRICS_PTR(prev);
117                         if (prev & DST_METRICS_READ_ONLY)
118                                 p = NULL;
119                 }
120         }
121         return p;
122 }
123
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 {
126         struct in6_addr *p = &rt->rt6i_gateway;
127
128         if (!ipv6_addr_any(p))
129                 return (const void *) p;
130         return daddr;
131 }
132
133 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 {
135         struct rt6_info *rt = (struct rt6_info *) dst;
136         struct neighbour *n;
137
138         daddr = choose_neigh_daddr(rt, daddr);
139         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
140         if (n)
141                 return n;
142         return neigh_create(&nd_tbl, daddr, dst->dev);
143 }
144
145 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 {
147         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148         if (!n) {
149                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
150                 if (IS_ERR(n))
151                         return PTR_ERR(n);
152         }
153         dst_set_neighbour(&rt->dst, n);
154
155         return 0;
156 }
157
158 static struct dst_ops ip6_dst_ops_template = {
159         .family                 =       AF_INET6,
160         .protocol               =       cpu_to_be16(ETH_P_IPV6),
161         .gc                     =       ip6_dst_gc,
162         .gc_thresh              =       1024,
163         .check                  =       ip6_dst_check,
164         .default_advmss         =       ip6_default_advmss,
165         .mtu                    =       ip6_mtu,
166         .cow_metrics            =       ipv6_cow_metrics,
167         .destroy                =       ip6_dst_destroy,
168         .ifdown                 =       ip6_dst_ifdown,
169         .negative_advice        =       ip6_negative_advice,
170         .link_failure           =       ip6_link_failure,
171         .update_pmtu            =       ip6_rt_update_pmtu,
172         .local_out              =       __ip6_local_out,
173         .neigh_lookup           =       ip6_neigh_lookup,
174 };
175
176 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 {
178         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179
180         return mtu ? : dst->dev->mtu;
181 }
182
183 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
184 {
185 }
186
187 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
188                                          unsigned long old)
189 {
190         return NULL;
191 }
192
193 static struct dst_ops ip6_dst_blackhole_ops = {
194         .family                 =       AF_INET6,
195         .protocol               =       cpu_to_be16(ETH_P_IPV6),
196         .destroy                =       ip6_dst_destroy,
197         .check                  =       ip6_dst_check,
198         .mtu                    =       ip6_blackhole_mtu,
199         .default_advmss         =       ip6_default_advmss,
200         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
201         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
202         .neigh_lookup           =       ip6_neigh_lookup,
203 };
204
205 static const u32 ip6_template_metrics[RTAX_MAX] = {
206         [RTAX_HOPLIMIT - 1] = 0,
207 };
208
209 static struct rt6_info ip6_null_entry_template = {
210         .dst = {
211                 .__refcnt       = ATOMIC_INIT(1),
212                 .__use          = 1,
213                 .obsolete       = -1,
214                 .error          = -ENETUNREACH,
215                 .input          = ip6_pkt_discard,
216                 .output         = ip6_pkt_discard_out,
217         },
218         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
219         .rt6i_protocol  = RTPROT_KERNEL,
220         .rt6i_metric    = ~(u32) 0,
221         .rt6i_ref       = ATOMIC_INIT(1),
222 };
223
224 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
225
226 static int ip6_pkt_prohibit(struct sk_buff *skb);
227 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
228
229 static struct rt6_info ip6_prohibit_entry_template = {
230         .dst = {
231                 .__refcnt       = ATOMIC_INIT(1),
232                 .__use          = 1,
233                 .obsolete       = -1,
234                 .error          = -EACCES,
235                 .input          = ip6_pkt_prohibit,
236                 .output         = ip6_pkt_prohibit_out,
237         },
238         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
239         .rt6i_protocol  = RTPROT_KERNEL,
240         .rt6i_metric    = ~(u32) 0,
241         .rt6i_ref       = ATOMIC_INIT(1),
242 };
243
244 static struct rt6_info ip6_blk_hole_entry_template = {
245         .dst = {
246                 .__refcnt       = ATOMIC_INIT(1),
247                 .__use          = 1,
248                 .obsolete       = -1,
249                 .error          = -EINVAL,
250                 .input          = dst_discard,
251                 .output         = dst_discard,
252         },
253         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
254         .rt6i_protocol  = RTPROT_KERNEL,
255         .rt6i_metric    = ~(u32) 0,
256         .rt6i_ref       = ATOMIC_INIT(1),
257 };
258
259 #endif
260
261 /* allocate dst with ip6_dst_ops */
262 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
263                                              struct net_device *dev,
264                                              int flags)
265 {
266         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
267
268         if (rt)
269                 memset(&rt->rt6i_table, 0,
270                        sizeof(*rt) - sizeof(struct dst_entry));
271
272         return rt;
273 }
274
275 static void ip6_dst_destroy(struct dst_entry *dst)
276 {
277         struct rt6_info *rt = (struct rt6_info *)dst;
278         struct inet6_dev *idev = rt->rt6i_idev;
279         struct inet_peer *peer = rt->rt6i_peer;
280
281         if (!(rt->dst.flags & DST_HOST))
282                 dst_destroy_metrics_generic(dst);
283
284         if (idev) {
285                 rt->rt6i_idev = NULL;
286                 in6_dev_put(idev);
287         }
288
289         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
290                 dst_release(dst->from);
291
292         if (peer) {
293                 rt->rt6i_peer = NULL;
294                 inet_putpeer(peer);
295         }
296 }
297
298 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
299
300 static u32 rt6_peer_genid(void)
301 {
302         return atomic_read(&__rt6_peer_genid);
303 }
304
305 void rt6_bind_peer(struct rt6_info *rt, int create)
306 {
307         struct inet_peer *peer;
308
309         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
310         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
311                 inet_putpeer(peer);
312         else
313                 rt->rt6i_peer_genid = rt6_peer_genid();
314 }
315
316 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
317                            int how)
318 {
319         struct rt6_info *rt = (struct rt6_info *)dst;
320         struct inet6_dev *idev = rt->rt6i_idev;
321         struct net_device *loopback_dev =
322                 dev_net(dev)->loopback_dev;
323
324         if (dev != loopback_dev && idev && idev->dev == dev) {
325                 struct inet6_dev *loopback_idev =
326                         in6_dev_get(loopback_dev);
327                 if (loopback_idev) {
328                         rt->rt6i_idev = loopback_idev;
329                         in6_dev_put(idev);
330                 }
331         }
332 }
333
334 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
335 {
336         struct rt6_info *ort = NULL;
337
338         if (rt->rt6i_flags & RTF_EXPIRES) {
339                 if (time_after(jiffies, rt->dst.expires))
340                         return 1;
341         } else if (rt->dst.from) {
342                 ort = (struct rt6_info *) rt->dst.from;
343                 return (ort->rt6i_flags & RTF_EXPIRES) &&
344                         time_after(jiffies, ort->dst.expires);
345         }
346         return 0;
347 }
348
349 static inline int rt6_need_strict(const struct in6_addr *daddr)
350 {
351         return ipv6_addr_type(daddr) &
352                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
353 }
354
355 /*
356  *      Route lookup. Any table->tb6_lock is implied.
357  */
358
359 static inline struct rt6_info *rt6_device_match(struct net *net,
360                                                     struct rt6_info *rt,
361                                                     const struct in6_addr *saddr,
362                                                     int oif,
363                                                     int flags)
364 {
365         struct rt6_info *local = NULL;
366         struct rt6_info *sprt;
367
368         if (!oif && ipv6_addr_any(saddr))
369                 goto out;
370
371         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
372                 struct net_device *dev = sprt->dst.dev;
373
374                 if (oif) {
375                         if (dev->ifindex == oif)
376                                 return sprt;
377                         if (dev->flags & IFF_LOOPBACK) {
378                                 if (!sprt->rt6i_idev ||
379                                     sprt->rt6i_idev->dev->ifindex != oif) {
380                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
381                                                 continue;
382                                         if (local && (!oif ||
383                                                       local->rt6i_idev->dev->ifindex == oif))
384                                                 continue;
385                                 }
386                                 local = sprt;
387                         }
388                 } else {
389                         if (ipv6_chk_addr(net, saddr, dev,
390                                           flags & RT6_LOOKUP_F_IFACE))
391                                 return sprt;
392                 }
393         }
394
395         if (oif) {
396                 if (local)
397                         return local;
398
399                 if (flags & RT6_LOOKUP_F_IFACE)
400                         return net->ipv6.ip6_null_entry;
401         }
402 out:
403         return rt;
404 }
405
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If @rt has a neighbour entry that is not in a NUD_VALID state and the
 * entry has not been updated within the device's rtr_probe_interval,
 * send a neighbour solicitation for it to the solicited-node multicast
 * address.  A NULL @rt is accepted and ignored.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute; the rate limit is enforced by stamping neigh->updated
 * before the probe is sent.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;
        struct in6_addr mcaddr;
        struct in6_addr *target;

        rcu_read_lock();
        neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
        if (!neigh || (neigh->nud_state & NUD_VALID))
                goto out;

        /*
         * neigh->updated is modified below, so the writer side of the
         * lock is required; a reader lock would let concurrent probers
         * race on the timestamp and defeat the rate limit.
         */
        write_lock_bh(&neigh->lock);
        if ((neigh->nud_state & NUD_VALID) ||
            !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                write_unlock_bh(&neigh->lock);
                goto out;
        }
        neigh->updated = jiffies;
        write_unlock_bh(&neigh->lock);

        /* send the solicitation with the neighbour lock released */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
out:
        rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
445
446 /*
447  * Default Router Selection (RFC 2461 6.3.6)
448  */
449 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
450 {
451         struct net_device *dev = rt->dst.dev;
452         if (!oif || dev->ifindex == oif)
453                 return 2;
454         if ((dev->flags & IFF_LOOPBACK) &&
455             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
456                 return 1;
457         return 0;
458 }
459
460 static inline int rt6_check_neigh(struct rt6_info *rt)
461 {
462         struct neighbour *neigh;
463         int m;
464
465         rcu_read_lock();
466         neigh = dst_get_neighbour_noref(&rt->dst);
467         if (rt->rt6i_flags & RTF_NONEXTHOP ||
468             !(rt->rt6i_flags & RTF_GATEWAY))
469                 m = 1;
470         else if (neigh) {
471                 read_lock_bh(&neigh->lock);
472                 if (neigh->nud_state & NUD_VALID)
473                         m = 2;
474 #ifdef CONFIG_IPV6_ROUTER_PREF
475                 else if (neigh->nud_state & NUD_FAILED)
476                         m = 0;
477 #endif
478                 else
479                         m = 1;
480                 read_unlock_bh(&neigh->lock);
481         } else
482                 m = 0;
483         rcu_read_unlock();
484         return m;
485 }
486
487 static int rt6_score_route(struct rt6_info *rt, int oif,
488                            int strict)
489 {
490         int m, n;
491
492         m = rt6_check_dev(rt, oif);
493         if (!m && (strict & RT6_LOOKUP_F_IFACE))
494                 return -1;
495 #ifdef CONFIG_IPV6_ROUTER_PREF
496         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
497 #endif
498         n = rt6_check_neigh(rt);
499         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
500                 return -1;
501         return m;
502 }
503
504 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
505                                    int *mpri, struct rt6_info *match)
506 {
507         int m;
508
509         if (rt6_check_expired(rt))
510                 goto out;
511
512         m = rt6_score_route(rt, oif, strict);
513         if (m < 0)
514                 goto out;
515
516         if (m > *mpri) {
517                 if (strict & RT6_LOOKUP_F_REACHABLE)
518                         rt6_probe(match);
519                 *mpri = m;
520                 match = rt;
521         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
522                 rt6_probe(rt);
523         }
524
525 out:
526         return match;
527 }
528
529 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
530                                      struct rt6_info *rr_head,
531                                      u32 metric, int oif, int strict)
532 {
533         struct rt6_info *rt, *match;
534         int mpri = -1;
535
536         match = NULL;
537         for (rt = rr_head; rt && rt->rt6i_metric == metric;
538              rt = rt->dst.rt6_next)
539                 match = find_match(rt, oif, strict, &mpri, match);
540         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
541              rt = rt->dst.rt6_next)
542                 match = find_match(rt, oif, strict, &mpri, match);
543
544         return match;
545 }
546
547 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
548 {
549         struct rt6_info *match, *rt0;
550         struct net *net;
551
552         rt0 = fn->rr_ptr;
553         if (!rt0)
554                 fn->rr_ptr = rt0 = fn->leaf;
555
556         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
557
558         if (!match &&
559             (strict & RT6_LOOKUP_F_REACHABLE)) {
560                 struct rt6_info *next = rt0->dst.rt6_next;
561
562                 /* no entries matched; do round-robin */
563                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
564                         next = fn->leaf;
565
566                 if (next != rt0)
567                         fn->rr_ptr = next;
568         }
569
570         net = dev_net(rt0->dst.dev);
571         return match ? match : net->ipv6.ip6_null_entry;
572 }
573
574 #ifdef CONFIG_IPV6_ROUTE_INFO
575 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
576                   const struct in6_addr *gwaddr)
577 {
578         struct net *net = dev_net(dev);
579         struct route_info *rinfo = (struct route_info *) opt;
580         struct in6_addr prefix_buf, *prefix;
581         unsigned int pref;
582         unsigned long lifetime;
583         struct rt6_info *rt;
584
585         if (len < sizeof(struct route_info)) {
586                 return -EINVAL;
587         }
588
589         /* Sanity check for prefix_len and length */
590         if (rinfo->length > 3) {
591                 return -EINVAL;
592         } else if (rinfo->prefix_len > 128) {
593                 return -EINVAL;
594         } else if (rinfo->prefix_len > 64) {
595                 if (rinfo->length < 2) {
596                         return -EINVAL;
597                 }
598         } else if (rinfo->prefix_len > 0) {
599                 if (rinfo->length < 1) {
600                         return -EINVAL;
601                 }
602         }
603
604         pref = rinfo->route_pref;
605         if (pref == ICMPV6_ROUTER_PREF_INVALID)
606                 return -EINVAL;
607
608         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
609
610         if (rinfo->length == 3)
611                 prefix = (struct in6_addr *)rinfo->prefix;
612         else {
613                 /* this function is safe */
614                 ipv6_addr_prefix(&prefix_buf,
615                                  (struct in6_addr *)rinfo->prefix,
616                                  rinfo->prefix_len);
617                 prefix = &prefix_buf;
618         }
619
620         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
621                                 dev->ifindex);
622
623         if (rt && !lifetime) {
624                 ip6_del_rt(rt);
625                 rt = NULL;
626         }
627
628         if (!rt && lifetime)
629                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
630                                         pref);
631         else if (rt)
632                 rt->rt6i_flags = RTF_ROUTEINFO |
633                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
634
635         if (rt) {
636                 if (!addrconf_finite_timeout(lifetime))
637                         rt6_clean_expires(rt);
638                 else
639                         rt6_set_expires(rt, jiffies + HZ * lifetime);
640
641                 dst_release(&rt->dst);
642         }
643         return 0;
644 }
645 #endif
646
/*
 * BACKTRACK(net, saddr) - climb the fib6 tree after a failed selection.
 *
 * Expands in a function that has locals `rt` and `fn` and labels `out`
 * and `restart`.  When `rt` is the namespace's null entry, walk towards
 * the root: jump to `out` on reaching the tree root, or to `restart` as
 * soon as a node carrying route info (RTN_RTINFO) is found.  When the
 * parent has a subtree other than the current node, that subtree is
 * searched for `saddr` instead of stepping to the parent itself.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == (__net)->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                for (;;) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        fn = (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) ? \
                                fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr) : \
                                pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
664
665 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
666                                              struct fib6_table *table,
667                                              struct flowi6 *fl6, int flags)
668 {
669         struct fib6_node *fn;
670         struct rt6_info *rt;
671
672         read_lock_bh(&table->tb6_lock);
673         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
674 restart:
675         rt = fn->leaf;
676         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
677         BACKTRACK(net, &fl6->saddr);
678 out:
679         dst_use(&rt->dst, jiffies);
680         read_unlock_bh(&table->tb6_lock);
681         return rt;
682
683 }
684
685 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
686                                     int flags)
687 {
688         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
689 }
690 EXPORT_SYMBOL_GPL(ip6_route_lookup);
691
692 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
693                             const struct in6_addr *saddr, int oif, int strict)
694 {
695         struct flowi6 fl6 = {
696                 .flowi6_oif = oif,
697                 .daddr = *daddr,
698         };
699         struct dst_entry *dst;
700         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
701
702         if (saddr) {
703                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
704                 flags |= RT6_LOOKUP_F_HAS_SADDR;
705         }
706
707         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
708         if (dst->error == 0)
709                 return (struct rt6_info *) dst;
710
711         dst_release(dst);
712
713         return NULL;
714 }
715
716 EXPORT_SYMBOL(rt6_lookup);
717
/* ip6_ins_rt is called with table->tb6_lock free (not held).
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In any case, if the caller does not hold it,
   it may be destroyed.
 */
723
724 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
725 {
726         int err;
727         struct fib6_table *table;
728
729         table = rt->rt6i_table;
730         write_lock_bh(&table->tb6_lock);
731         err = fib6_add(&table->tb6_root, rt, info);
732         write_unlock_bh(&table->tb6_lock);
733
734         return err;
735 }
736
737 int ip6_ins_rt(struct rt6_info *rt)
738 {
739         struct nl_info info = {
740                 .nl_net = dev_net(rt->dst.dev),
741         };
742         return __ip6_ins_rt(rt, &info);
743 }
744
745 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
746                                       const struct in6_addr *daddr,
747                                       const struct in6_addr *saddr)
748 {
749         struct rt6_info *rt;
750
751         /*
752          *      Clone the route.
753          */
754
755         rt = ip6_rt_copy(ort, daddr);
756
757         if (rt) {
758                 int attempts = !in_softirq();
759
760                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
761                         if (ort->rt6i_dst.plen != 128 &&
762                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
763                                 rt->rt6i_flags |= RTF_ANYCAST;
764                         rt->rt6i_gateway = *daddr;
765                 }
766
767                 rt->rt6i_flags |= RTF_CACHE;
768
769 #ifdef CONFIG_IPV6_SUBTREES
770                 if (rt->rt6i_src.plen && saddr) {
771                         rt->rt6i_src.addr = *saddr;
772                         rt->rt6i_src.plen = 128;
773                 }
774 #endif
775
776         retry:
777                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
778                         struct net *net = dev_net(rt->dst.dev);
779                         int saved_rt_min_interval =
780                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
781                         int saved_rt_elasticity =
782                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
783
784                         if (attempts-- > 0) {
785                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
786                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
787
788                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
789
790                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
791                                         saved_rt_elasticity;
792                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
793                                         saved_rt_min_interval;
794                                 goto retry;
795                         }
796
797                         if (net_ratelimit())
798                                 printk(KERN_WARNING
799                                        "ipv6: Neighbour table overflow.\n");
800                         dst_free(&rt->dst);
801                         return NULL;
802                 }
803         }
804
805         return rt;
806 }
807
808 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
809                                         const struct in6_addr *daddr)
810 {
811         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
812
813         if (rt) {
814                 rt->rt6i_flags |= RTF_CACHE;
815                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
816         }
817         return rt;
818 }
819
820 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
821                                       struct flowi6 *fl6, int flags)
822 {
823         struct fib6_node *fn;
824         struct rt6_info *rt, *nrt;
825         int strict = 0;
826         int attempts = 3;
827         int err;
828         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
829
830         strict |= flags & RT6_LOOKUP_F_IFACE;
831
832 relookup:
833         read_lock_bh(&table->tb6_lock);
834
835 restart_2:
836         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
837
838 restart:
839         rt = rt6_select(fn, oif, strict | reachable);
840
841         BACKTRACK(net, &fl6->saddr);
842         if (rt == net->ipv6.ip6_null_entry ||
843             rt->rt6i_flags & RTF_CACHE)
844                 goto out;
845
846         dst_hold(&rt->dst);
847         read_unlock_bh(&table->tb6_lock);
848
849         if (!dst_get_neighbour_noref_raw(&rt->dst) &&
850             !(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_LOCAL)))
851                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
852         else if (!(rt->dst.flags & DST_HOST))
853                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
854         else
855                 goto out2;
856
857         dst_release(&rt->dst);
858         rt = nrt ? : net->ipv6.ip6_null_entry;
859
860         dst_hold(&rt->dst);
861         if (nrt) {
862                 err = ip6_ins_rt(nrt);
863                 if (!err)
864                         goto out2;
865         }
866
867         if (--attempts <= 0)
868                 goto out2;
869
870         /*
871          * Race condition! In the gap, when table->tb6_lock was
872          * released someone could insert this route.  Relookup.
873          */
874         dst_release(&rt->dst);
875         goto relookup;
876
877 out:
878         if (reachable) {
879                 reachable = 0;
880                 goto restart_2;
881         }
882         dst_hold(&rt->dst);
883         read_unlock_bh(&table->tb6_lock);
884 out2:
885         rt->dst.lastuse = jiffies;
886         rt->dst.__use++;
887
888         return rt;
889 }
890
891 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
892                                             struct flowi6 *fl6, int flags)
893 {
894         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
895 }
896
897 static struct dst_entry *ip6_route_input_lookup(struct net *net,
898                                                 struct net_device *dev,
899                                                 struct flowi6 *fl6, int flags)
900 {
901         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
902                 flags |= RT6_LOOKUP_F_IFACE;
903
904         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
905 }
906
907 void ip6_route_input(struct sk_buff *skb)
908 {
909         const struct ipv6hdr *iph = ipv6_hdr(skb);
910         struct net *net = dev_net(skb->dev);
911         int flags = RT6_LOOKUP_F_HAS_SADDR;
912         struct flowi6 fl6 = {
913                 .flowi6_iif = skb->dev->ifindex,
914                 .daddr = iph->daddr,
915                 .saddr = iph->saddr,
916                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
917                 .flowi6_mark = skb->mark,
918                 .flowi6_proto = iph->nexthdr,
919         };
920
921         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
922 }
923
924 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
925                                              struct flowi6 *fl6, int flags)
926 {
927         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
928 }
929
930 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
931                                     struct flowi6 *fl6)
932 {
933         int flags = 0;
934
935         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
936                 flags |= RT6_LOOKUP_F_IFACE;
937
938         if (!ipv6_addr_any(&fl6->saddr))
939                 flags |= RT6_LOOKUP_F_HAS_SADDR;
940         else if (sk)
941                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
942
943         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
944 }
945
946 EXPORT_SYMBOL(ip6_route_output);
947
948 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
949 {
950         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
951         struct dst_entry *new = NULL;
952
953         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
954         if (rt) {
955                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
956
957                 new = &rt->dst;
958
959                 new->__use = 1;
960                 new->input = dst_discard;
961                 new->output = dst_discard;
962
963                 if (dst_metrics_read_only(&ort->dst))
964                         new->_metrics = ort->dst._metrics;
965                 else
966                         dst_copy_metrics(new, &ort->dst);
967                 rt->rt6i_idev = ort->rt6i_idev;
968                 if (rt->rt6i_idev)
969                         in6_dev_hold(rt->rt6i_idev);
970
971                 rt->rt6i_gateway = ort->rt6i_gateway;
972                 rt->rt6i_flags = ort->rt6i_flags;
973                 rt6_clean_expires(rt);
974                 rt->rt6i_metric = 0;
975
976                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
977 #ifdef CONFIG_IPV6_SUBTREES
978                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
979 #endif
980
981                 dst_free(new);
982         }
983
984         dst_release(dst_orig);
985         return new ? new : ERR_PTR(-ENOMEM);
986 }
987
988 /*
989  *      Destination cache support functions
990  */
991
992 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
993 {
994         struct rt6_info *rt;
995
996         rt = (struct rt6_info *) dst;
997
998         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
999                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1000                         if (!rt->rt6i_peer)
1001                                 rt6_bind_peer(rt, 0);
1002                         rt->rt6i_peer_genid = rt6_peer_genid();
1003                 }
1004                 return dst;
1005         }
1006         return NULL;
1007 }
1008
1009 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1010 {
1011         struct rt6_info *rt = (struct rt6_info *) dst;
1012
1013         if (rt) {
1014                 if (rt->rt6i_flags & RTF_CACHE) {
1015                         if (rt6_check_expired(rt)) {
1016                                 ip6_del_rt(rt);
1017                                 dst = NULL;
1018                         }
1019                 } else {
1020                         dst_release(dst);
1021                         dst = NULL;
1022                 }
1023         }
1024         return dst;
1025 }
1026
1027 static void ip6_link_failure(struct sk_buff *skb)
1028 {
1029         struct rt6_info *rt;
1030
1031         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1032
1033         rt = (struct rt6_info *) skb_dst(skb);
1034         if (rt) {
1035                 if (rt->rt6i_flags & RTF_CACHE)
1036                         rt6_update_expires(rt, 0);
1037                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1038                         rt->rt6i_node->fn_sernum = -1;
1039         }
1040 }
1041
1042 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1043 {
1044         struct rt6_info *rt6 = (struct rt6_info*)dst;
1045
1046         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1047                 rt6->rt6i_flags |= RTF_MODIFIED;
1048                 if (mtu < IPV6_MIN_MTU) {
1049                         u32 features = dst_metric(dst, RTAX_FEATURES);
1050                         mtu = IPV6_MIN_MTU;
1051                         features |= RTAX_FEATURE_ALLFRAG;
1052                         dst_metric_set(dst, RTAX_FEATURES, features);
1053                 }
1054                 dst_metric_set(dst, RTAX_MTU, mtu);
1055         }
1056 }
1057
1058 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1059 {
1060         struct net_device *dev = dst->dev;
1061         unsigned int mtu = dst_mtu(dst);
1062         struct net *net = dev_net(dev);
1063
1064         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1065
1066         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1067                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1068
1069         /*
1070          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1071          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1072          * IPV6_MAXPLEN is also valid and means: "any MSS,
1073          * rely only on pmtu discovery"
1074          */
1075         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1076                 mtu = IPV6_MAXPLEN;
1077         return mtu;
1078 }
1079
1080 static unsigned int ip6_mtu(const struct dst_entry *dst)
1081 {
1082         struct inet6_dev *idev;
1083         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1084
1085         if (mtu)
1086                 return mtu;
1087
1088         mtu = IPV6_MIN_MTU;
1089
1090         rcu_read_lock();
1091         idev = __in6_dev_get(dst->dev);
1092         if (idev)
1093                 mtu = idev->cnf.mtu6;
1094         rcu_read_unlock();
1095
1096         return mtu;
1097 }
1098
/*
 * Head of a singly linked list (chained through dst_entry.next) of the
 * dst entries created by icmp6_dst_alloc().  icmp6_dst_gc() and
 * icmp6_clean_all() unlink and free entries from it.
 */
1099 static struct dst_entry *icmp6_dst_gc_list;
/* Taken with spin_lock_bh() around every walk or update of icmp6_dst_gc_list. */
1100 static DEFINE_SPINLOCK(icmp6_dst_lock);
1101
1102 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1103                                   struct neighbour *neigh,
1104                                   struct flowi6 *fl6)
1105 {
1106         struct dst_entry *dst;
1107         struct rt6_info *rt;
1108         struct inet6_dev *idev = in6_dev_get(dev);
1109         struct net *net = dev_net(dev);
1110
1111         if (unlikely(!idev))
1112                 return ERR_PTR(-ENODEV);
1113
1114         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1115         if (unlikely(!rt)) {
1116                 in6_dev_put(idev);
1117                 dst = ERR_PTR(-ENOMEM);
1118                 goto out;
1119         }
1120
1121         if (neigh)
1122                 neigh_hold(neigh);
1123         else {
1124                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1125                 if (IS_ERR(neigh)) {
1126                         in6_dev_put(idev);
1127                         dst_free(&rt->dst);
1128                         return ERR_CAST(neigh);
1129                 }
1130         }
1131
1132         rt->dst.flags |= DST_HOST;
1133         rt->dst.output  = ip6_output;
1134         dst_set_neighbour(&rt->dst, neigh);
1135         atomic_set(&rt->dst.__refcnt, 1);
1136         rt->rt6i_dst.addr = fl6->daddr;
1137         rt->rt6i_dst.plen = 128;
1138         rt->rt6i_idev     = idev;
1139         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1140
1141         spin_lock_bh(&icmp6_dst_lock);
1142         rt->dst.next = icmp6_dst_gc_list;
1143         icmp6_dst_gc_list = &rt->dst;
1144         spin_unlock_bh(&icmp6_dst_lock);
1145
1146         fib6_force_start_gc(net);
1147
1148         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1149
1150 out:
1151         return dst;
1152 }
1153
1154 int icmp6_dst_gc(void)
1155 {
1156         struct dst_entry *dst, **pprev;
1157         int more = 0;
1158
1159         spin_lock_bh(&icmp6_dst_lock);
1160         pprev = &icmp6_dst_gc_list;
1161
1162         while ((dst = *pprev) != NULL) {
1163                 if (!atomic_read(&dst->__refcnt)) {
1164                         *pprev = dst->next;
1165                         dst_free(dst);
1166                 } else {
1167                         pprev = &dst->next;
1168                         ++more;
1169                 }
1170         }
1171
1172         spin_unlock_bh(&icmp6_dst_lock);
1173
1174         return more;
1175 }
1176
1177 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1178                             void *arg)
1179 {
1180         struct dst_entry *dst, **pprev;
1181
1182         spin_lock_bh(&icmp6_dst_lock);
1183         pprev = &icmp6_dst_gc_list;
1184         while ((dst = *pprev) != NULL) {
1185                 struct rt6_info *rt = (struct rt6_info *) dst;
1186                 if (func(rt, arg)) {
1187                         *pprev = dst->next;
1188                         dst_free(dst);
1189                 } else {
1190                         pprev = &dst->next;
1191                 }
1192         }
1193         spin_unlock_bh(&icmp6_dst_lock);
1194 }
1195
1196 static int ip6_dst_gc(struct dst_ops *ops)
1197 {
1198         unsigned long now = jiffies;
1199         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1200         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1201         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1202         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1203         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1204         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1205         int entries;
1206
1207         entries = dst_entries_get_fast(ops);
1208         if (time_after(rt_last_gc + rt_min_interval, now) &&
1209             entries <= rt_max_size)
1210                 goto out;
1211
1212         net->ipv6.ip6_rt_gc_expire++;
1213         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1214         net->ipv6.ip6_rt_last_gc = now;
1215         entries = dst_entries_get_slow(ops);
1216         if (entries < ops->gc_thresh)
1217                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1218 out:
1219         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1220         return entries > rt_max_size;
1221 }
1222
1223 /* Clean host part of a prefix. Not necessary in radix tree,
1224    but results in cleaner routing tables.
1225
1226    Remove it only when all the things will work!
1227  */
1228
1229 int ip6_dst_hoplimit(struct dst_entry *dst)
1230 {
1231         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1232         if (hoplimit == 0) {
1233                 struct net_device *dev = dst->dev;
1234                 struct inet6_dev *idev;
1235
1236                 rcu_read_lock();
1237                 idev = __in6_dev_get(dev);
1238                 if (idev)
1239                         hoplimit = idev->cnf.hop_limit;
1240                 else
1241                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1242                 rcu_read_unlock();
1243         }
1244         return hoplimit;
1245 }
1246 EXPORT_SYMBOL(ip6_dst_hoplimit);
1247
1248 /*
1249  *
1250  */
1251
/*
 * Build a route from the request in @cfg and insert it with
 * __ip6_ins_rt().
 *
 * @cfg is modified: a zero fc_metric becomes IP6_RT_PRIO_USER, an
 * RTPROT_UNSPEC fc_protocol becomes RTPROT_BOOT, and on the install path
 * fc_nlinfo.nl_net is overwritten with the namespace of the chosen device.
 *
 * Returns the result of __ip6_ins_rt() once the route is fully built, or
 * a negative errno set on the way:
 *   -EINVAL       prefix length > 128 (or any source prefix without
 *                 CONFIG_IPV6_SUBTREES), non-unicast gateway, gateway
 *                 route with no device or on a loopback device, preferred
 *                 source address failing ipv6_chk_addr(), or a metrics
 *                 attribute type above RTAX_MAX
 *   -ENODEV       device/inet6_dev lookup failed or no device resolved
 *   -ENOBUFS      no routing table could be obtained
 *   -ENOMEM       route or metrics allocation failed
 *   -EHOSTUNREACH gateway has no route, is reached via another device
 *                 than the requested one, or only via a gateway route
 *   (plus whatever rt6_bind_neighbour() returns on failure)
 *
 * On every failure path the device and inet6_dev references taken here
 * are dropped and the partially built route is freed.
 */
1252 int ip6_route_add(struct fib6_config *cfg)
1253 {
1254         int err;
1255         struct net *net = cfg->fc_nlinfo.nl_net;
1256         struct rt6_info *rt = NULL;
1257         struct net_device *dev = NULL;
1258         struct inet6_dev *idev = NULL;
1259         struct fib6_table *table;
1260         int addr_type;
1261
1262         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1263                 return -EINVAL;
1264 #ifndef CONFIG_IPV6_SUBTREES
1265         if (cfg->fc_src_len)
1266                 return -EINVAL;
1267 #endif
/* An explicit ifindex pins the device; hold both it and its inet6_dev. */
1268         if (cfg->fc_ifindex) {
1269                 err = -ENODEV;
1270                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1271                 if (!dev)
1272                         goto out;
1273                 idev = in6_dev_get(dev);
1274                 if (!idev)
1275                         goto out;
1276         }
1277
1278         if (cfg->fc_metric == 0)
1279                 cfg->fc_metric = IP6_RT_PRIO_USER;
1280
/*
 * A netlink request without NLM_F_CREATE first tries an existing table;
 * if there is none the table is still created, after a warning.
 */
1281         err = -ENOBUFS;
1282         if (cfg->fc_nlinfo.nlh &&
1283             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1284                 table = fib6_get_table(net, cfg->fc_table);
1285                 if (!table) {
1286                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1287                         table = fib6_new_table(net, cfg->fc_table);
1288                 }
1289         } else {
1290                 table = fib6_new_table(net, cfg->fc_table);
1291         }
1292
1293         if (!table)
1294                 goto out;
1295
1296         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1297
1298         if (!rt) {
1299                 err = -ENOMEM;
1300                 goto out;
1301         }
1302
1303         rt->dst.obsolete = -1;
1304
1305         if (cfg->fc_flags & RTF_EXPIRES)
1306                 rt6_set_expires(rt, jiffies +
1307                                 clock_t_to_jiffies(cfg->fc_expires));
1308         else
1309                 rt6_clean_expires(rt);
1310
1311         if (cfg->fc_protocol == RTPROT_UNSPEC)
1312                 cfg->fc_protocol = RTPROT_BOOT;
1313         rt->rt6i_protocol = cfg->fc_protocol;
1314
1315         addr_type = ipv6_addr_type(&cfg->fc_dst);
1316
/* Pick the input handler from the destination type and RTF_LOCAL. */
1317         if (addr_type & IPV6_ADDR_MULTICAST)
1318                 rt->dst.input = ip6_mc_input;
1319         else if (cfg->fc_flags & RTF_LOCAL)
1320                 rt->dst.input = ip6_input;
1321         else
1322                 rt->dst.input = ip6_forward;
1323
1324         rt->dst.output = ip6_output;
1325
1326         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1327         rt->rt6i_dst.plen = cfg->fc_dst_len;
/* A /128 destination is marked as a host route. */
1328         if (rt->rt6i_dst.plen == 128)
1329                rt->dst.flags |= DST_HOST;
1330
/* Non-host routes that carry metrics attributes get their own zeroed metrics array. */
1331         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1332                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1333                 if (!metrics) {
1334                         err = -ENOMEM;
1335                         goto out;
1336                 }
1337                 dst_init_metrics(&rt->dst, metrics, 0);
1338         }
1339 #ifdef CONFIG_IPV6_SUBTREES
1340         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1341         rt->rt6i_src.plen = cfg->fc_src_len;
1342 #endif
1343
1344         rt->rt6i_metric = cfg->fc_metric;
1345
1346         /* We cannot add true routes via loopback here,
1347            they would result in kernel looping; promote them to reject routes
1348          */
1349         if ((cfg->fc_flags & RTF_REJECT) ||
1350             (dev && (dev->flags & IFF_LOOPBACK) &&
1351              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1352              !(cfg->fc_flags & RTF_LOCAL))) {
1353                 /* hold loopback dev/idev if we haven't done so. */
1354                 if (dev != net->loopback_dev) {
1355                         if (dev) {
1356                                 dev_put(dev);
1357                                 in6_dev_put(idev);
1358                         }
1359                         dev = net->loopback_dev;
1360                         dev_hold(dev);
1361                         idev = in6_dev_get(dev);
1362                         if (!idev) {
1363                                 err = -ENODEV;
1364                                 goto out;
1365                         }
1366                 }
/* Reject routes discard in both directions and report -ENETUNREACH. */
1367                 rt->dst.output = ip6_pkt_discard_out;
1368                 rt->dst.input = ip6_pkt_discard;
1369                 rt->dst.error = -ENETUNREACH;
1370                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1371                 goto install_route;
1372         }
1373
1374         if (cfg->fc_flags & RTF_GATEWAY) {
1375                 const struct in6_addr *gw_addr;
1376                 int gwa_type;
1377
1378                 gw_addr = &cfg->fc_gateway;
1379                 rt->rt6i_gateway = *gw_addr;
1380                 gwa_type = ipv6_addr_type(gw_addr);
1381
1382                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1383                         struct rt6_info *grt;
1384
1385                         /* IPv6 strictly inhibits using not link-local
1386                            addresses as nexthop address.
1387                            Otherwise, router will not able to send redirects.
1388                            It is very good, but in some (rare!) circumstances
1389                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1390                            some exceptions. --ANK
1391                          */
1392                         err = -EINVAL;
1393                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1394                                 goto out;
1395
1396                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1397
1398                         err = -EHOSTUNREACH;
1399                         if (!grt)
1400                                 goto out;
1401                         if (dev) {
1402                                 if (dev != grt->dst.dev) {
1403                                         dst_release(&grt->dst);
1404                                         goto out;
1405                                 }
1406                         } else {
/* No device was requested: adopt (and hold) the one the gateway's route uses. */
1407                                 dev = grt->dst.dev;
1408                                 idev = grt->rt6i_idev;
1409                                 dev_hold(dev);
1410                                 in6_dev_hold(grt->rt6i_idev);
1411                         }
/* Only a non-gateway route to the gateway clears the -EHOSTUNREACH set above. */
1412                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1413                                 err = 0;
1414                         dst_release(&grt->dst);
1415
1416                         if (err)
1417                                 goto out;
1418                 }
1419                 err = -EINVAL;
1420                 if (!dev || (dev->flags & IFF_LOOPBACK))
1421                         goto out;
1422         }
1423
1424         err = -ENODEV;
1425         if (!dev)
1426                 goto out;
1427
/* A preferred source address must pass ipv6_chk_addr() for this device. */
1428         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1429                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1430                         err = -EINVAL;
1431                         goto out;
1432                 }
1433                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1434                 rt->rt6i_prefsrc.plen = 128;
1435         } else
1436                 rt->rt6i_prefsrc.plen = 0;
1437
1438         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1439                 err = rt6_bind_neighbour(rt, dev);
1440                 if (err)
1441                         goto out;
1442         }
1443
1444         rt->rt6i_flags = cfg->fc_flags;
1445
1446 install_route:
/* Copy the user-supplied metrics; attribute type 0 is skipped. */
1447         if (cfg->fc_mx) {
1448                 struct nlattr *nla;
1449                 int remaining;
1450
1451                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1452                         int type = nla_type(nla);
1453
1454                         if (type) {
1455                                 if (type > RTAX_MAX) {
1456                                         err = -EINVAL;
1457                                         goto out;
1458                                 }
1459
1460                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1461                         }
1462                 }
1463         }
1464
/* The dev/idev references held above are stored in the route and not dropped on this path. */
1465         rt->dst.dev = dev;
1466         rt->rt6i_idev = idev;
1467         rt->rt6i_table = table;
1468
1469         cfg->fc_nlinfo.nl_net = dev_net(dev);
1470
1471         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1472
/* Common failure exit: undo whatever was acquired so far. */
1473 out:
1474         if (dev)
1475                 dev_put(dev);
1476         if (idev)
1477                 in6_dev_put(idev);
1478         if (rt)
1479                 dst_free(&rt->dst);
1480         return err;
1481 }
1482
1483 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1484 {
1485         int err;
1486         struct fib6_table *table;
1487         struct net *net = dev_net(rt->dst.dev);
1488
1489         if (rt == net->ipv6.ip6_null_entry) {
1490                 err = -ENOENT;
1491                 goto out;
1492         }
1493
1494         table = rt->rt6i_table;
1495         write_lock_bh(&table->tb6_lock);
1496         err = fib6_del(rt, info);
1497         write_unlock_bh(&table->tb6_lock);
1498
1499 out:
1500         dst_release(&rt->dst);
1501         return err;
1502 }
1503
1504 int ip6_del_rt(struct rt6_info *rt)
1505 {
1506         struct nl_info info = {
1507                 .nl_net = dev_net(rt->dst.dev),
1508         };
1509         return __ip6_del_rt(rt, &info);
1510 }
1511
1512 static int ip6_route_del(struct fib6_config *cfg)
1513 {
1514         struct fib6_table *table;
1515         struct fib6_node *fn;
1516         struct rt6_info *rt;
1517         int err = -ESRCH;
1518
1519         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1520         if (!table)
1521                 return err;
1522
1523         read_lock_bh(&table->tb6_lock);
1524
1525         fn = fib6_locate(&table->tb6_root,
1526                          &cfg->fc_dst, cfg->fc_dst_len,
1527                          &cfg->fc_src, cfg->fc_src_len);
1528
1529         if (fn) {
1530                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1531                         if (cfg->fc_ifindex &&
1532                             (!rt->dst.dev ||
1533                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1534                                 continue;
1535                         if (cfg->fc_flags & RTF_GATEWAY &&
1536                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1537                                 continue;
1538                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1539                                 continue;
1540                         dst_hold(&rt->dst);
1541                         read_unlock_bh(&table->tb6_lock);
1542
1543                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1544                 }
1545         }
1546         read_unlock_bh(&table->tb6_lock);
1547
1548         return err;
1549 }
1550
1551 /*
1552  *      Handle redirects
1553  */
/*
 * Flow key extended with the address of the router that sent a redirect.
 * ip6_route_redirect() passes &rdfl.fl6 to the rule lookup and
 * __ip6_route_redirect() casts that flowi6 pointer back to this type, so
 * fl6 has to stay the first member.
 */
1554 struct ip6rd_flowi {
1555         struct flowi6 fl6;
1556         struct in6_addr gateway;
1557 };
1558
1559 static struct rt6_info *__ip6_route_redirect(struct net *net,
1560                                              struct fib6_table *table,
1561                                              struct flowi6 *fl6,
1562                                              int flags)
1563 {
1564         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1565         struct rt6_info *rt;
1566         struct fib6_node *fn;
1567
1568         /*
1569          * Get the "current" route for this destination and
1570          * check if the redirect has come from approriate router.
1571          *
1572          * RFC 2461 specifies that redirects should only be
1573          * accepted if they come from the nexthop to the target.
1574          * Due to the way the routes are chosen, this notion
1575          * is a bit fuzzy and one might need to check all possible
1576          * routes.
1577          */
1578
1579         read_lock_bh(&table->tb6_lock);
1580         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1581 restart:
1582         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1583                 /*
1584                  * Current route is on-link; redirect is always invalid.
1585                  *
1586                  * Seems, previous statement is not true. It could
1587                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1588                  * But then router serving it might decide, that we should
1589                  * know truth 8)8) --ANK (980726).
1590                  */
1591                 if (rt6_check_expired(rt))
1592                         continue;
1593                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1594                         continue;
1595                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1596                         continue;
1597                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1598                         continue;
1599                 break;
1600         }
1601
1602         if (!rt)
1603                 rt = net->ipv6.ip6_null_entry;
1604         BACKTRACK(net, &fl6->saddr);
1605 out:
1606         dst_hold(&rt->dst);
1607
1608         read_unlock_bh(&table->tb6_lock);
1609
1610         return rt;
1611 };
1612
1613 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1614                                            const struct in6_addr *src,
1615                                            const struct in6_addr *gateway,
1616                                            struct net_device *dev)
1617 {
1618         int flags = RT6_LOOKUP_F_HAS_SADDR;
1619         struct net *net = dev_net(dev);
1620         struct ip6rd_flowi rdfl = {
1621                 .fl6 = {
1622                         .flowi6_oif = dev->ifindex,
1623                         .daddr = *dest,
1624                         .saddr = *src,
1625                 },
1626         };
1627
1628         rdfl.gateway = *gateway;
1629
1630         if (rt6_need_strict(dest))
1631                 flags |= RT6_LOOKUP_F_IFACE;
1632
1633         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1634                                                    flags, __ip6_route_redirect);
1635 }
1636
/*
 * Apply a received redirect.
 *
 * @dest:    destination passed to the route lookup and to ip6_rt_copy()
 * @src:     source address passed to the route lookup
 * @saddr:   passed to ip6_route_redirect() as the gateway to match, i.e.
 *           the router the redirect is checked against
 * @neigh:   neighbour entry for the new next hop; its device selects the
 *           namespace and lookup interface
 * @lladdr:  link-layer address handed to neigh_update()
 * @on_link: non-zero if the target is on-link, in which case the new
 *           route is not a gateway route and the neighbour is not marked
 *           as a router
 *
 * If no matching route exists the redirect is dropped with a rate-limited
 * debug message.  Otherwise the neighbour is updated, the old route is
 * confirmed, and - unless the route already uses this neighbour - a
 * RTF_CACHE|RTF_DYNAMIC copy pointing at the new next hop is inserted and
 * NETEVENT_REDIRECT notifiers are called.  An old route that was itself
 * a cache entry is then deleted.
 */
1637 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1638                   const struct in6_addr *saddr,
1639                   struct neighbour *neigh, u8 *lladdr, int on_link)
1640 {
1641         struct rt6_info *rt, *nrt = NULL;
1642         struct netevent_redirect netevent;
1643         struct net *net = dev_net(neigh->dev);
1644
/* The lookup returns a held route; every exit below must drop that reference. */
1645         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1646
1647         if (rt == net->ipv6.ip6_null_entry) {
1648                 if (net_ratelimit())
1649                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1650                                "for redirect target\n");
1651                 goto out;
1652         }
1653
1654         /*
1655          *      We have finally decided to accept it.
1656          */
1657
1658         neigh_update(neigh, lladdr, NUD_STALE,
1659                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1660                      NEIGH_UPDATE_F_OVERRIDE|
1661                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1662                                      NEIGH_UPDATE_F_ISROUTER))
1663                      );
1664
1665         /*
1666          * Redirect received -> path was valid.
1667          * Look, redirects are sent only in response to data packets,
1668          * so that this nexthop apparently is reachable. --ANK
1669          */
1670         dst_confirm(&rt->dst);
1671
1672         /* Duplicate redirect: silently ignore. */
1673         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1674                 goto out;
1675
1676         nrt = ip6_rt_copy(rt, dest);
1677         if (!nrt)
1678                 goto out;
1679
1680         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1681         if (on_link)
1682                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1683
/* The new next hop is the neighbour's own address. */
1684         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1685         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1686
1687         if (ip6_ins_rt(nrt))
1688                 goto out;
1689
1690         netevent.old = &rt->dst;
1691         netevent.new = &nrt->dst;
1692         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1693
/*
 * ip6_del_rt() releases the reference on rt itself, so this path returns
 * directly instead of going through "out".
 */
1694         if (rt->rt6i_flags & RTF_CACHE) {
1695                 ip6_del_rt(rt);
1696                 return;
1697         }
1698
1699 out:
1700         dst_release(&rt->dst);
1701 }
1702
1703 /*
1704  *      Handle ICMP "packet too big" messages
1705  *      i.e. Path MTU discovery
1706  */
1707
/*
 * Apply a reported path MTU to the route for (@daddr, @saddr).
 *
 * @daddr:   destination of the flow the MTU applies to
 * @saddr:   source address for the route lookup
 * @net:     network namespace to look the route up in
 * @pmtu:    reported path MTU
 * @ifindex: interface index passed to rt6_lookup()
 *
 * Expired routes found by the lookup are deleted and the lookup retried.
 * A @pmtu that is not smaller than the route's current MTU is ignored.
 * A @pmtu below IPV6_MIN_MTU is clamped to IPV6_MIN_MTU and the route
 * additionally gets RTAX_FEATURE_ALLFRAG.  An existing cache route is
 * updated in place; for any other route a new entry carrying the reduced
 * MTU is created and inserted, expiring after ip6_rt_mtu_expires.
 */
1708 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1709                              struct net *net, u32 pmtu, int ifindex)
1710 {
1711         struct rt6_info *rt, *nrt;
1712         int allfrag = 0;
1713 again:
1714         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1715         if (!rt)
1716                 return;
1717
/* ip6_del_rt() drops the lookup's reference, so simply look up again. */
1718         if (rt6_check_expired(rt)) {
1719                 ip6_del_rt(rt);
1720                 goto again;
1721         }
1722
1723         if (pmtu >= dst_mtu(&rt->dst))
1724                 goto out;
1725
1726         if (pmtu < IPV6_MIN_MTU) {
1727                 /*
1728                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1729                  * MTU (1280) and a fragment header should always be included
1730                  * after a node receiving Too Big message reporting PMTU is
1731                  * less than the IPv6 Minimum Link MTU.
1732                  */
1733                 pmtu = IPV6_MIN_MTU;
1734                 allfrag = 1;
1735         }
1736
1737         /* New mtu received -> path was valid.
1738            They are sent only in response to data packets,
1739            so that this nexthop apparently is reachable. --ANK
1740          */
1741         dst_confirm(&rt->dst);
1742
1743         /* Host route. If it is static, it would be better
1744            not to override it, but add new one, so that
1745            when cache entry will expire old pmtu
1746            would return automatically.
1747          */
1748         if (rt->rt6i_flags & RTF_CACHE) {
1749                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1750                 if (allfrag) {
1751                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1752                         features |= RTAX_FEATURE_ALLFRAG;
1753                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1754                 }
1755                 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1756                 rt->rt6i_flags |= RTF_MODIFIED;
1757                 goto out;
1758         }
1759
1760         /* Network route.
1761            Two cases are possible:
1762            1. It is connected route. Action: COW
1763            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1764          */
1765         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1766                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1767         else
1768                 nrt = rt6_alloc_clone(rt, daddr);
1769
/* An allocation failure is silently ignored: the MTU update is just dropped. */
1770         if (nrt) {
1771                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1772                 if (allfrag) {
1773                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1774                         features |= RTAX_FEATURE_ALLFRAG;
1775                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1776                 }
1777
1778                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1779                  * happened within 5 mins, the recommended timer is 10 mins.
1780                  * Here this route expiration time is set to ip6_rt_mtu_expires
1781                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1782                  * and detecting PMTU increase will be automatically happened.
1783                  */
1784                 rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1785                 nrt->rt6i_flags |= RTF_DYNAMIC;
1786                 ip6_ins_rt(nrt);
1787         }
1788 out:
1789         dst_release(&rt->dst);
1790 }
1791
1792 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1793                         struct net_device *dev, u32 pmtu)
1794 {
1795         struct net *net = dev_net(dev);
1796
1797         /*
1798          * RFC 1981 states that a node "MUST reduce the size of the packets it
1799          * is sending along the path" that caused the Packet Too Big message.
1800          * Since it's not possible in the general case to determine which
1801          * interface was used to send the original packet, we update the MTU
1802          * on the interface that will be used to send future packets. We also
1803          * update the MTU on the interface that received the Packet Too Big in
1804          * case the original packet was forced out that interface with
1805          * SO_BINDTODEVICE or similar. This is the next best thing to the
1806          * correct behaviour, which would be to update the MTU on all
1807          * interfaces.
1808          */
1809         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1810         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1811 }
1812
1813 /*
1814  *      Misc support functions
1815  */
1816
1817 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1818                                     const struct in6_addr *dest)
1819 {
1820         struct net *net = dev_net(ort->dst.dev);
1821         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1822                                             ort->dst.dev, 0);
1823
1824         if (rt) {
1825                 rt->dst.input = ort->dst.input;
1826                 rt->dst.output = ort->dst.output;
1827                 rt->dst.flags |= DST_HOST;
1828
1829                 rt->rt6i_dst.addr = *dest;
1830                 rt->rt6i_dst.plen = 128;
1831                 dst_copy_metrics(&rt->dst, &ort->dst);
1832                 rt->dst.error = ort->dst.error;
1833                 rt->rt6i_idev = ort->rt6i_idev;
1834                 if (rt->rt6i_idev)
1835                         in6_dev_hold(rt->rt6i_idev);
1836                 rt->dst.lastuse = jiffies;
1837
1838                 rt->rt6i_gateway = ort->rt6i_gateway;
1839                 rt->rt6i_flags = ort->rt6i_flags;
1840                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1841                     (RTF_DEFAULT | RTF_ADDRCONF))
1842                         rt6_set_from(rt, ort);
1843                 else
1844                         rt6_clean_expires(rt);
1845                 rt->rt6i_metric = 0;
1846
1847 #ifdef CONFIG_IPV6_SUBTREES
1848                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1849 #endif
1850                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1851                 rt->rt6i_table = ort->rt6i_table;
1852         }
1853         return rt;
1854 }
1855
1856 #ifdef CONFIG_IPV6_ROUTE_INFO
1857 static struct rt6_info *rt6_get_route_info(struct net *net,
1858                                            const struct in6_addr *prefix, int prefixlen,
1859                                            const struct in6_addr *gwaddr, int ifindex)
1860 {
1861         struct fib6_node *fn;
1862         struct rt6_info *rt = NULL;
1863         struct fib6_table *table;
1864
1865         table = fib6_get_table(net, RT6_TABLE_INFO);
1866         if (!table)
1867                 return NULL;
1868
1869         write_lock_bh(&table->tb6_lock);
1870         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1871         if (!fn)
1872                 goto out;
1873
1874         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1875                 if (rt->dst.dev->ifindex != ifindex)
1876                         continue;
1877                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1878                         continue;
1879                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1880                         continue;
1881                 dst_hold(&rt->dst);
1882                 break;
1883         }
1884 out:
1885         write_unlock_bh(&table->tb6_lock);
1886         return rt;
1887 }
1888
1889 static struct rt6_info *rt6_add_route_info(struct net *net,
1890                                            const struct in6_addr *prefix, int prefixlen,
1891                                            const struct in6_addr *gwaddr, int ifindex,
1892                                            unsigned pref)
1893 {
1894         struct fib6_config cfg = {
1895                 .fc_table       = RT6_TABLE_INFO,
1896                 .fc_metric      = IP6_RT_PRIO_USER,
1897                 .fc_ifindex     = ifindex,
1898                 .fc_dst_len     = prefixlen,
1899                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1900                                   RTF_UP | RTF_PREF(pref),
1901                 .fc_nlinfo.pid = 0,
1902                 .fc_nlinfo.nlh = NULL,
1903                 .fc_nlinfo.nl_net = net,
1904         };
1905
1906         cfg.fc_dst = *prefix;
1907         cfg.fc_gateway = *gwaddr;
1908
1909         /* We should treat it as a default route if prefix length is 0. */
1910         if (!prefixlen)
1911                 cfg.fc_flags |= RTF_DEFAULT;
1912
1913         ip6_route_add(&cfg);
1914
1915         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1916 }
1917 #endif
1918
1919 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1920 {
1921         struct rt6_info *rt;
1922         struct fib6_table *table;
1923
1924         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1925         if (!table)
1926                 return NULL;
1927
1928         write_lock_bh(&table->tb6_lock);
1929         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1930                 if (dev == rt->dst.dev &&
1931                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1932                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1933                         break;
1934         }
1935         if (rt)
1936                 dst_hold(&rt->dst);
1937         write_unlock_bh(&table->tb6_lock);
1938         return rt;
1939 }
1940
1941 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1942                                      struct net_device *dev,
1943                                      unsigned int pref)
1944 {
1945         struct fib6_config cfg = {
1946                 .fc_table       = RT6_TABLE_DFLT,
1947                 .fc_metric      = IP6_RT_PRIO_USER,
1948                 .fc_ifindex     = dev->ifindex,
1949                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1950                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1951                 .fc_nlinfo.pid = 0,
1952                 .fc_nlinfo.nlh = NULL,
1953                 .fc_nlinfo.nl_net = dev_net(dev),
1954         };
1955
1956         cfg.fc_gateway = *gwaddr;
1957
1958         ip6_route_add(&cfg);
1959
1960         return rt6_get_dflt_router(gwaddr, dev);
1961 }
1962
1963 void rt6_purge_dflt_routers(struct net *net)
1964 {
1965         struct rt6_info *rt;
1966         struct fib6_table *table;
1967
1968         /* NOTE: Keep consistent with rt6_get_dflt_router */
1969         table = fib6_get_table(net, RT6_TABLE_DFLT);
1970         if (!table)
1971                 return;
1972
1973 restart:
1974         read_lock_bh(&table->tb6_lock);
1975         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1976                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1977                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1978                         dst_hold(&rt->dst);
1979                         read_unlock_bh(&table->tb6_lock);
1980                         ip6_del_rt(rt);
1981                         goto restart;
1982                 }
1983         }
1984         read_unlock_bh(&table->tb6_lock);
1985 }
1986
1987 static void rtmsg_to_fib6_config(struct net *net,
1988                                  struct in6_rtmsg *rtmsg,
1989                                  struct fib6_config *cfg)
1990 {
1991         memset(cfg, 0, sizeof(*cfg));
1992
1993         cfg->fc_table = RT6_TABLE_MAIN;
1994         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1995         cfg->fc_metric = rtmsg->rtmsg_metric;
1996         cfg->fc_expires = rtmsg->rtmsg_info;
1997         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1998         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1999         cfg->fc_flags = rtmsg->rtmsg_flags;
2000
2001         cfg->fc_nlinfo.nl_net = net;
2002
2003         cfg->fc_dst = rtmsg->rtmsg_dst;
2004         cfg->fc_src = rtmsg->rtmsg_src;
2005         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2006 }
2007
2008 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2009 {
2010         struct fib6_config cfg;
2011         struct in6_rtmsg rtmsg;
2012         int err;
2013
2014         switch(cmd) {
2015         case SIOCADDRT:         /* Add a route */
2016         case SIOCDELRT:         /* Delete a route */
2017                 if (!capable(CAP_NET_ADMIN))
2018                         return -EPERM;
2019                 err = copy_from_user(&rtmsg, arg,
2020                                      sizeof(struct in6_rtmsg));
2021                 if (err)
2022                         return -EFAULT;
2023
2024                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2025
2026                 rtnl_lock();
2027                 switch (cmd) {
2028                 case SIOCADDRT:
2029                         err = ip6_route_add(&cfg);
2030                         break;
2031                 case SIOCDELRT:
2032                         err = ip6_route_del(&cfg);
2033                         break;
2034                 default:
2035                         err = -EINVAL;
2036                 }
2037                 rtnl_unlock();
2038
2039                 return err;
2040         }
2041
2042         return -EINVAL;
2043 }
2044
2045 /*
2046  *      Drop the packet on the floor
2047  */
2048
2049 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2050 {
2051         int type;
2052         struct dst_entry *dst = skb_dst(skb);
2053         switch (ipstats_mib_noroutes) {
2054         case IPSTATS_MIB_INNOROUTES:
2055                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2056                 if (type == IPV6_ADDR_ANY) {
2057                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2058                                       IPSTATS_MIB_INADDRERRORS);
2059                         break;
2060                 }
2061                 /* FALLTHROUGH */
2062         case IPSTATS_MIB_OUTNOROUTES:
2063                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2064                               ipstats_mib_noroutes);
2065                 break;
2066         }
2067         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2068         kfree_skb(skb);
2069         return 0;
2070 }
2071
2072 static int ip6_pkt_discard(struct sk_buff *skb)
2073 {
2074         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2075 }
2076
2077 static int ip6_pkt_discard_out(struct sk_buff *skb)
2078 {
2079         skb->dev = skb_dst(skb)->dev;
2080         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2081 }
2082
2083 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2084
2085 static int ip6_pkt_prohibit(struct sk_buff *skb)
2086 {
2087         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2088 }
2089
2090 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2091 {
2092         skb->dev = skb_dst(skb)->dev;
2093         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2094 }
2095
2096 #endif
2097
2098 /*
2099  *      Allocate a dst for local (unicast / anycast) address.
2100  */
2101
2102 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2103                                     const struct in6_addr *addr,
2104                                     bool anycast)
2105 {
2106         struct net *net = dev_net(idev->dev);
2107         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2108                                             net->loopback_dev, 0);
2109         int err;
2110
2111         if (!rt) {
2112                 if (net_ratelimit())
2113                         pr_warning("IPv6:  Maximum number of routes reached,"
2114                                    " consider increasing route/max_size.\n");
2115                 return ERR_PTR(-ENOMEM);
2116         }
2117
2118         in6_dev_hold(idev);
2119
2120         rt->dst.flags |= DST_HOST;
2121         rt->dst.input = ip6_input;
2122         rt->dst.output = ip6_output;
2123         rt->rt6i_idev = idev;
2124         rt->dst.obsolete = -1;
2125
2126         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2127         if (anycast)
2128                 rt->rt6i_flags |= RTF_ANYCAST;
2129         else
2130                 rt->rt6i_flags |= RTF_LOCAL;
2131         err = rt6_bind_neighbour(rt, rt->dst.dev);
2132         if (err) {
2133                 dst_free(&rt->dst);
2134                 return ERR_PTR(err);
2135         }
2136
2137         rt->rt6i_dst.addr = *addr;
2138         rt->rt6i_dst.plen = 128;
2139         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2140
2141         atomic_set(&rt->dst.__refcnt, 1);
2142
2143         return rt;
2144 }
2145
2146 int ip6_route_get_saddr(struct net *net,
2147                         struct rt6_info *rt,
2148                         const struct in6_addr *daddr,
2149                         unsigned int prefs,
2150                         struct in6_addr *saddr)
2151 {
2152         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2153         int err = 0;
2154         if (rt->rt6i_prefsrc.plen)
2155                 *saddr = rt->rt6i_prefsrc.addr;
2156         else
2157                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2158                                          daddr, prefs, saddr);
2159         return err;
2160 }
2161
/*
 * Walker argument for fib6_remove_prefsrc(): identifies the address that
 * has to be dropped from the preferred-source field of routes.
 */
struct arg_dev_net_ip {
	struct net_device	*dev;	/* device to restrict to; NULL = any */
	struct net		*net;	/* namespace (its null entry is skipped) */
	struct in6_addr		*addr;	/* the address being removed */
};
2168
2169 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2170 {
2171         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2172         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2173         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2174
2175         if (((void *)rt->dst.dev == dev || !dev) &&
2176             rt != net->ipv6.ip6_null_entry &&
2177             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2178                 /* remove prefsrc entry */
2179                 rt->rt6i_prefsrc.plen = 0;
2180         }
2181         return 0;
2182 }
2183
2184 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2185 {
2186         struct net *net = dev_net(ifp->idev->dev);
2187         struct arg_dev_net_ip adni = {
2188                 .dev = ifp->idev->dev,
2189                 .net = net,
2190                 .addr = &ifp->addr,
2191         };
2192         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2193 }
2194
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device	*dev;	/* device to match; NULL = any */
	struct net		*net;	/* namespace (its null entry is skipped) */
};
2199
2200 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2201 {
2202         const struct arg_dev_net *adn = arg;
2203         const struct net_device *dev = adn->dev;
2204
2205         if ((rt->dst.dev == dev || !dev) &&
2206             rt != adn->net->ipv6.ip6_null_entry)
2207                 return -1;
2208
2209         return 0;
2210 }
2211
2212 void rt6_ifdown(struct net *net, struct net_device *dev)
2213 {
2214         struct arg_dev_net adn = {
2215                 .dev = dev,
2216                 .net = net,
2217         };
2218
2219         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2220         icmp6_clean_all(fib6_ifdown, &adn);
2221 }
2222
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device	*dev;	/* device whose MTU changed */
	unsigned int		mtu;	/* the new MTU */
};
2228
2229 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2230 {
2231         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2232         struct inet6_dev *idev;
2233
2234         /* In IPv6 pmtu discovery is not optional,
2235            so that RTAX_MTU lock cannot disable it.
2236            We still use this lock to block changes
2237            caused by addrconf/ndisc.
2238         */
2239
2240         idev = __in6_dev_get(arg->dev);
2241         if (!idev)
2242                 return 0;
2243
2244         /* For administrative MTU increase, there is no way to discover
2245            IPv6 PMTU increase, so PMTU increase should be updated here.
2246            Since RFC 1981 doesn't include administrative MTU increase
2247            update PMTU increase is a MUST. (i.e. jumbo frame)
2248          */
2249         /*
2250            If new MTU is less than route PMTU, this new MTU will be the
2251            lowest MTU in the path, update the route PMTU to reflect PMTU
2252            decreases; if new MTU is greater than route PMTU, and the
2253            old MTU is the lowest MTU in the path, update the route PMTU
2254            to reflect the increase. In this case if the other nodes' MTU
2255            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2256            PMTU discouvery.
2257          */
2258         if (rt->dst.dev == arg->dev &&
2259             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2260             (dst_mtu(&rt->dst) >= arg->mtu ||
2261              (dst_mtu(&rt->dst) < arg->mtu &&
2262               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2263                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2264         }
2265         return 0;
2266 }
2267
2268 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2269 {
2270         struct rt6_mtu_change_arg arg = {
2271                 .dev = dev,
2272                 .mtu = mtu,
2273         };
2274
2275         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2276 }
2277
2278 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2279         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2280         [RTA_OIF]               = { .type = NLA_U32 },
2281         [RTA_IIF]               = { .type = NLA_U32 },
2282         [RTA_PRIORITY]          = { .type = NLA_U32 },
2283         [RTA_METRICS]           = { .type = NLA_NESTED },
2284 };
2285
2286 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2287                               struct fib6_config *cfg)
2288 {
2289         struct rtmsg *rtm;
2290         struct nlattr *tb[RTA_MAX+1];
2291         int err;
2292
2293         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2294         if (err < 0)
2295                 goto errout;
2296
2297         err = -EINVAL;
2298         rtm = nlmsg_data(nlh);
2299         memset(cfg, 0, sizeof(*cfg));
2300
2301         cfg->fc_table = rtm->rtm_table;
2302         cfg->fc_dst_len = rtm->rtm_dst_len;
2303         cfg->fc_src_len = rtm->rtm_src_len;
2304         cfg->fc_flags = RTF_UP;
2305         cfg->fc_protocol = rtm->rtm_protocol;
2306
2307         if (rtm->rtm_type == RTN_UNREACHABLE)
2308                 cfg->fc_flags |= RTF_REJECT;
2309
2310         if (rtm->rtm_type == RTN_LOCAL)
2311                 cfg->fc_flags |= RTF_LOCAL;
2312
2313         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2314         cfg->fc_nlinfo.nlh = nlh;
2315         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2316
2317         if (tb[RTA_GATEWAY]) {
2318                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2319                 cfg->fc_flags |= RTF_GATEWAY;
2320         }
2321
2322         if (tb[RTA_DST]) {
2323                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2324
2325                 if (nla_len(tb[RTA_DST]) < plen)
2326                         goto errout;
2327
2328                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2329         }
2330
2331         if (tb[RTA_SRC]) {
2332                 int plen = (rtm->rtm_src_len + 7) >> 3;
2333
2334                 if (nla_len(tb[RTA_SRC]) < plen)
2335                         goto errout;
2336
2337                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2338         }
2339
2340         if (tb[RTA_PREFSRC])
2341                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2342
2343         if (tb[RTA_OIF])
2344                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2345
2346         if (tb[RTA_PRIORITY])
2347                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2348
2349         if (tb[RTA_METRICS]) {
2350                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2351                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2352         }
2353
2354         if (tb[RTA_TABLE])
2355                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2356
2357         err = 0;
2358 errout:
2359         return err;
2360 }
2361
2362 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2363 {
2364         struct fib6_config cfg;
2365         int err;
2366
2367         err = rtm_to_fib6_config(skb, nlh, &cfg);
2368         if (err < 0)
2369                 return err;
2370
2371         return ip6_route_del(&cfg);
2372 }
2373
2374 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2375 {
2376         struct fib6_config cfg;
2377         int err;
2378
2379         err = rtm_to_fib6_config(skb, nlh, &cfg);
2380         if (err < 0)
2381                 return err;
2382
2383         return ip6_route_add(&cfg);
2384 }
2385
2386 static inline size_t rt6_nlmsg_size(void)
2387 {
2388         return NLMSG_ALIGN(sizeof(struct rtmsg))
2389                + nla_total_size(16) /* RTA_SRC */
2390                + nla_total_size(16) /* RTA_DST */
2391                + nla_total_size(16) /* RTA_GATEWAY */
2392                + nla_total_size(16) /* RTA_PREFSRC */
2393                + nla_total_size(4) /* RTA_TABLE */
2394                + nla_total_size(4) /* RTA_IIF */
2395                + nla_total_size(4) /* RTA_OIF */
2396                + nla_total_size(4) /* RTA_PRIORITY */
2397                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2398                + nla_total_size(sizeof(struct rta_cacheinfo));
2399 }
2400
2401 static int rt6_fill_node(struct net *net,
2402                          struct sk_buff *skb, struct rt6_info *rt,
2403                          struct in6_addr *dst, struct in6_addr *src,
2404                          int iif, int type, u32 pid, u32 seq,
2405                          int prefix, int nowait, unsigned int flags)
2406 {
2407         const struct inet_peer *peer;
2408         struct rtmsg *rtm;
2409         struct nlmsghdr *nlh;
2410         long expires;
2411         u32 table;
2412         struct neighbour *n;
2413         u32 ts, tsage;
2414
2415         if (prefix) {   /* user wants prefix routes only */
2416                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2417                         /* success since this is not a prefix route */
2418                         return 1;
2419                 }
2420         }
2421
2422         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2423         if (!nlh)
2424                 return -EMSGSIZE;
2425
2426         rtm = nlmsg_data(nlh);
2427         rtm->rtm_family = AF_INET6;
2428         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2429         rtm->rtm_src_len = rt->rt6i_src.plen;
2430         rtm->rtm_tos = 0;
2431         if (rt->rt6i_table)
2432                 table = rt->rt6i_table->tb6_id;
2433         else
2434                 table = RT6_TABLE_UNSPEC;
2435         rtm->rtm_table = table;
2436         NLA_PUT_U32(skb, RTA_TABLE, table);
2437         if (rt->rt6i_flags & RTF_REJECT)
2438                 rtm->rtm_type = RTN_UNREACHABLE;
2439         else if (rt->rt6i_flags & RTF_LOCAL)
2440                 rtm->rtm_type = RTN_LOCAL;
2441         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2442                 rtm->rtm_type = RTN_LOCAL;
2443         else
2444                 rtm->rtm_type = RTN_UNICAST;
2445         rtm->rtm_flags = 0;
2446         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2447         rtm->rtm_protocol = rt->rt6i_protocol;
2448         if (rt->rt6i_flags & RTF_DYNAMIC)
2449                 rtm->rtm_protocol = RTPROT_REDIRECT;
2450         else if (rt->rt6i_flags & RTF_ADDRCONF)
2451                 rtm->rtm_protocol = RTPROT_KERNEL;
2452         else if (rt->rt6i_flags & RTF_DEFAULT)
2453                 rtm->rtm_protocol = RTPROT_RA;
2454
2455         if (rt->rt6i_flags & RTF_CACHE)
2456                 rtm->rtm_flags |= RTM_F_CLONED;
2457
2458         if (dst) {
2459                 NLA_PUT(skb, RTA_DST, 16, dst);
2460                 rtm->rtm_dst_len = 128;
2461         } else if (rtm->rtm_dst_len)
2462                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2463 #ifdef CONFIG_IPV6_SUBTREES
2464         if (src) {
2465                 NLA_PUT(skb, RTA_SRC, 16, src);
2466                 rtm->rtm_src_len = 128;
2467         } else if (rtm->rtm_src_len)
2468                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2469 #endif
2470         if (iif) {
2471 #ifdef CONFIG_IPV6_MROUTE
2472                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2473                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2474                         if (err <= 0) {
2475                                 if (!nowait) {
2476                                         if (err == 0)
2477                                                 return 0;
2478                                         goto nla_put_failure;
2479                                 } else {
2480                                         if (err == -EMSGSIZE)
2481                                                 goto nla_put_failure;
2482                                 }
2483                         }
2484                 } else
2485 #endif
2486                         NLA_PUT_U32(skb, RTA_IIF, iif);
2487         } else if (dst) {
2488                 struct in6_addr saddr_buf;
2489                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2490                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2491         }
2492
2493         if (rt->rt6i_prefsrc.plen) {
2494                 struct in6_addr saddr_buf;
2495                 saddr_buf = rt->rt6i_prefsrc.addr;
2496                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2497         }
2498
2499         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2500                 goto nla_put_failure;
2501
2502         rcu_read_lock();
2503         n = dst_get_neighbour_noref(&rt->dst);
2504         if (n) {
2505                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2506                         rcu_read_unlock();
2507                         goto nla_put_failure;
2508                 }
2509         }
2510         rcu_read_unlock();
2511
2512         if (rt->dst.dev)
2513                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2514
2515         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2516
2517         if (!(rt->rt6i_flags & RTF_EXPIRES))
2518                 expires = 0;
2519         else if (rt->dst.expires - jiffies < INT_MAX)
2520                 expires = rt->dst.expires - jiffies;
2521         else
2522                 expires = INT_MAX;
2523
2524         peer = rt->rt6i_peer;
2525         ts = tsage = 0;
2526         if (peer && peer->tcp_ts_stamp) {
2527                 ts = peer->tcp_ts;
2528                 tsage = get_seconds() - peer->tcp_ts_stamp;
2529         }
2530
2531         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2532                                expires, rt->dst.error) < 0)
2533                 goto nla_put_failure;
2534
2535         return nlmsg_end(skb, nlh);
2536
2537 nla_put_failure:
2538         nlmsg_cancel(skb, nlh);
2539         return -EMSGSIZE;
2540 }
2541
2542 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2543 {
2544         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2545         int prefix;
2546
2547         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2548                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2549                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2550         } else
2551                 prefix = 0;
2552
2553         return rt6_fill_node(arg->net,
2554                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2555                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2556                      prefix, 0, NLM_F_MULTI);
2557 }
2558
2559 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2560 {
2561         struct net *net = sock_net(in_skb->sk);
2562         struct nlattr *tb[RTA_MAX+1];
2563         struct rt6_info *rt;
2564         struct sk_buff *skb;
2565         struct rtmsg *rtm;
2566         struct flowi6 fl6;
2567         int err, iif = 0, oif = 0;
2568
2569         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2570         if (err < 0)
2571                 goto errout;
2572
2573         err = -EINVAL;
2574         memset(&fl6, 0, sizeof(fl6));
2575
2576         if (tb[RTA_SRC]) {
2577                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2578                         goto errout;
2579
2580                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2581         }
2582
2583         if (tb[RTA_DST]) {
2584                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2585                         goto errout;
2586
2587                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2588         }
2589
2590         if (tb[RTA_IIF])
2591                 iif = nla_get_u32(tb[RTA_IIF]);
2592
2593         if (tb[RTA_OIF])
2594                 oif = nla_get_u32(tb[RTA_OIF]);
2595
2596         if (iif) {
2597                 struct net_device *dev;
2598                 int flags = 0;
2599
2600                 dev = __dev_get_by_index(net, iif);
2601                 if (!dev) {
2602                         err = -ENODEV;
2603                         goto errout;
2604                 }
2605
2606                 fl6.flowi6_iif = iif;
2607
2608                 if (!ipv6_addr_any(&fl6.saddr))
2609                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2610
2611                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2612                                                                flags);
2613         } else {
2614                 fl6.flowi6_oif = oif;
2615
2616                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2617         }
2618
2619         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2620         if (!skb) {
2621                 err = -ENOBUFS;
2622                 goto errout;
2623         }
2624
2625         /* Reserve room for dummy headers, this skb can pass
2626            through good chunk of routing engine.
2627          */
2628         skb_reset_mac_header(skb);
2629         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2630
2631         skb_dst_set(skb, &rt->dst);
2632
2633         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2634                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2635                             nlh->nlmsg_seq, 0, 0, 0);
2636         if (err < 0) {
2637                 kfree_skb(skb);
2638                 goto errout;
2639         }
2640
2641         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2642 errout:
2643         return err;
2644 }
2645
2646 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2647 {
2648         struct sk_buff *skb;
2649         struct net *net = info->nl_net;
2650         u32 seq;
2651         int err;
2652
2653         err = -ENOBUFS;
2654         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2655
2656         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2657         if (!skb)
2658                 goto errout;
2659
2660         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2661                                 event, info->pid, seq, 0, 0, 0);
2662         if (err < 0) {
2663                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2664                 WARN_ON(err == -EMSGSIZE);
2665                 kfree_skb(skb);
2666                 goto errout;
2667         }
2668         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2669                     info->nlh, gfp_any());
2670         return;
2671 errout:
2672         if (err < 0)
2673                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2674 }
2675
2676 static int ip6_route_dev_notify(struct notifier_block *this,
2677                                 unsigned long event, void *data)
2678 {
2679         struct net_device *dev = (struct net_device *)data;
2680         struct net *net = dev_net(dev);
2681
2682         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2683                 net->ipv6.ip6_null_entry->dst.dev = dev;
2684                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2685 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2686                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2687                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2688                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2689                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2690 #endif
2691         }
2692
2693         return NOTIFY_OK;
2694 }
2695
2696 /*
2697  *      /proc
2698  */
2699
2700 #ifdef CONFIG_PROC_FS
2701
/*
 * Bookkeeping for a buffer-based /proc read.
 * NOTE(review): nothing in the visible part of this file references this
 * structure; it looks like a leftover from before the seq_file
 * conversion - confirm before removing.
 */
struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2710
2711 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2712 {
2713         struct seq_file *m = p_arg;
2714         struct neighbour *n;
2715
2716         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2717
2718 #ifdef CONFIG_IPV6_SUBTREES
2719         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2720 #else
2721         seq_puts(m, "00000000000000000000000000000000 00 ");
2722 #endif
2723         rcu_read_lock();
2724         n = dst_get_neighbour_noref(&rt->dst);
2725         if (n) {
2726                 seq_printf(m, "%pi6", n->primary_key);
2727         } else {
2728                 seq_puts(m, "00000000000000000000000000000000");
2729         }
2730         rcu_read_unlock();
2731         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2732                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2733                    rt->dst.__use, rt->rt6i_flags,
2734                    rt->dst.dev ? rt->dst.dev->name : "");
2735         return 0;
2736 }
2737
2738 static int ipv6_route_show(struct seq_file *m, void *v)
2739 {
2740         struct net *net = (struct net *)m->private;
2741         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2742         return 0;
2743 }
2744
2745 static int ipv6_route_open(struct inode *inode, struct file *file)
2746 {
2747         return single_open_net(inode, file, ipv6_route_show);
2748 }
2749
2750 static const struct file_operations ipv6_route_proc_fops = {
2751         .owner          = THIS_MODULE,
2752         .open           = ipv6_route_open,
2753         .read           = seq_read,
2754         .llseek         = seq_lseek,
2755         .release        = single_release_net,
2756 };
2757
2758 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2759 {
2760         struct net *net = (struct net *)seq->private;
2761         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2762                    net->ipv6.rt6_stats->fib_nodes,
2763                    net->ipv6.rt6_stats->fib_route_nodes,
2764                    net->ipv6.rt6_stats->fib_rt_alloc,
2765                    net->ipv6.rt6_stats->fib_rt_entries,
2766                    net->ipv6.rt6_stats->fib_rt_cache,
2767                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2768                    net->ipv6.rt6_stats->fib_discarded_routes);
2769
2770         return 0;
2771 }
2772
2773 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2774 {
2775         return single_open_net(inode, file, rt6_stats_seq_show);
2776 }
2777
2778 static const struct file_operations rt6_stats_seq_fops = {
2779         .owner   = THIS_MODULE,
2780         .open    = rt6_stats_seq_open,
2781         .read    = seq_read,
2782         .llseek  = seq_lseek,
2783         .release = single_release_net,
2784 };
2785 #endif  /* CONFIG_PROC_FS */
2786
2787 #ifdef CONFIG_SYSCTL
2788
2789 static
2790 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2791                               void __user *buffer, size_t *lenp, loff_t *ppos)
2792 {
2793         struct net *net;
2794         int delay;
2795         if (!write)
2796                 return -EINVAL;
2797
2798         net = (struct net *)ctl->extra1;
2799         delay = net->ipv6.sysctl.flush_delay;
2800         proc_dointvec(ctl, write, buffer, lenp, ppos);
2801         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2802         return 0;
2803 }
2804
2805 ctl_table ipv6_route_table_template[] = {
2806         {
2807                 .procname       =       "flush",
2808                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2809                 .maxlen         =       sizeof(int),
2810                 .mode           =       0200,
2811                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2812         },
2813         {
2814                 .procname       =       "gc_thresh",
2815                 .data           =       &ip6_dst_ops_template.gc_thresh,
2816                 .maxlen         =       sizeof(int),
2817                 .mode           =       0644,
2818                 .proc_handler   =       proc_dointvec,
2819         },
2820         {
2821                 .procname       =       "max_size",
2822                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2823                 .maxlen         =       sizeof(int),
2824                 .mode           =       0644,
2825                 .proc_handler   =       proc_dointvec,
2826         },
2827         {
2828                 .procname       =       "gc_min_interval",
2829                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2830                 .maxlen         =       sizeof(int),
2831                 .mode           =       0644,
2832                 .proc_handler   =       proc_dointvec_jiffies,
2833         },
2834         {
2835                 .procname       =       "gc_timeout",
2836                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2837                 .maxlen         =       sizeof(int),
2838                 .mode           =       0644,
2839                 .proc_handler   =       proc_dointvec_jiffies,
2840         },
2841         {
2842                 .procname       =       "gc_interval",
2843                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2844                 .maxlen         =       sizeof(int),
2845                 .mode           =       0644,
2846                 .proc_handler   =       proc_dointvec_jiffies,
2847         },
2848         {
2849                 .procname       =       "gc_elasticity",
2850                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2851                 .maxlen         =       sizeof(int),
2852                 .mode           =       0644,
2853                 .proc_handler   =       proc_dointvec,
2854         },
2855         {
2856                 .procname       =       "mtu_expires",
2857                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2858                 .maxlen         =       sizeof(int),
2859                 .mode           =       0644,
2860                 .proc_handler   =       proc_dointvec_jiffies,
2861         },
2862         {
2863                 .procname       =       "min_adv_mss",
2864                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2865                 .maxlen         =       sizeof(int),
2866                 .mode           =       0644,
2867                 .proc_handler   =       proc_dointvec,
2868         },
2869         {
2870                 .procname       =       "gc_min_interval_ms",
2871                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2872                 .maxlen         =       sizeof(int),
2873                 .mode           =       0644,
2874                 .proc_handler   =       proc_dointvec_ms_jiffies,
2875         },
2876         { }
2877 };
2878
2879 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2880 {
2881         struct ctl_table *table;
2882
2883         table = kmemdup(ipv6_route_table_template,
2884                         sizeof(ipv6_route_table_template),
2885                         GFP_KERNEL);
2886
2887         if (table) {
2888                 table[0].data = &net->ipv6.sysctl.flush_delay;
2889                 table[0].extra1 = net;
2890                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2891                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2892                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2893                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2894                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2895                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2896                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2897                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2898                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2899         }
2900
2901         return table;
2902 }
2903 #endif
2904
2905 static int __net_init ip6_route_net_init(struct net *net)
2906 {
2907         int ret = -ENOMEM;
2908
2909         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2910                sizeof(net->ipv6.ip6_dst_ops));
2911
2912         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2913                 goto out_ip6_dst_ops;
2914
2915         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2916                                            sizeof(*net->ipv6.ip6_null_entry),
2917                                            GFP_KERNEL);
2918         if (!net->ipv6.ip6_null_entry)
2919                 goto out_ip6_dst_entries;
2920         net->ipv6.ip6_null_entry->dst.path =
2921                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2922         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2923         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2924                          ip6_template_metrics, true);
2925
2926 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2927         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2928                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2929                                                GFP_KERNEL);
2930         if (!net->ipv6.ip6_prohibit_entry)
2931                 goto out_ip6_null_entry;
2932         net->ipv6.ip6_prohibit_entry->dst.path =
2933                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2934         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2935         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2936                          ip6_template_metrics, true);
2937
2938         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2939                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2940                                                GFP_KERNEL);
2941         if (!net->ipv6.ip6_blk_hole_entry)
2942                 goto out_ip6_prohibit_entry;
2943         net->ipv6.ip6_blk_hole_entry->dst.path =
2944                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2945         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2946         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2947                          ip6_template_metrics, true);
2948 #endif
2949
2950         net->ipv6.sysctl.flush_delay = 0;
2951         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2952         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2953         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2954         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2955         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2956         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2957         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2958
2959         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2960
2961         ret = 0;
2962 out:
2963         return ret;
2964
2965 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2966 out_ip6_prohibit_entry:
2967         kfree(net->ipv6.ip6_prohibit_entry);
2968 out_ip6_null_entry:
2969         kfree(net->ipv6.ip6_null_entry);
2970 #endif
2971 out_ip6_dst_entries:
2972         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2973 out_ip6_dst_ops:
2974         goto out;
2975 }
2976
2977 static void __net_exit ip6_route_net_exit(struct net *net)
2978 {
2979         kfree(net->ipv6.ip6_null_entry);
2980 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2981         kfree(net->ipv6.ip6_prohibit_entry);
2982         kfree(net->ipv6.ip6_blk_hole_entry);
2983 #endif
2984         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2985 }
2986
2987 static int __net_init ip6_route_net_init_late(struct net *net)
2988 {
2989 #ifdef CONFIG_PROC_FS
2990         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2991         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2992 #endif
2993         return 0;
2994 }
2995
2996 static void __net_exit ip6_route_net_exit_late(struct net *net)
2997 {
2998 #ifdef CONFIG_PROC_FS
2999         proc_net_remove(net, "ipv6_route");
3000         proc_net_remove(net, "rt6_stats");
3001 #endif
3002 }
3003
3004 static struct pernet_operations ip6_route_net_ops = {
3005         .init = ip6_route_net_init,
3006         .exit = ip6_route_net_exit,
3007 };
3008
3009 static struct pernet_operations ip6_route_net_late_ops = {
3010         .init = ip6_route_net_init_late,
3011         .exit = ip6_route_net_exit_late,
3012 };
3013
3014 static struct notifier_block ip6_route_dev_notifier = {
3015         .notifier_call = ip6_route_dev_notify,
3016         .priority = 0,
3017 };
3018
3019 int __init ip6_route_init(void)
3020 {
3021         int ret;
3022
3023         ret = -ENOMEM;
3024         ip6_dst_ops_template.kmem_cachep =
3025                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3026                                   SLAB_HWCACHE_ALIGN, NULL);
3027         if (!ip6_dst_ops_template.kmem_cachep)
3028                 goto out;
3029
3030         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3031         if (ret)
3032                 goto out_kmem_cache;
3033
3034         ret = register_pernet_subsys(&ip6_route_net_ops);
3035         if (ret)
3036                 goto out_dst_entries;
3037
3038         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3039
3040         /* Registering of the loopback is done before this portion of code,
3041          * the loopback reference in rt6_info will not be taken, do it
3042          * manually for init_net */
3043         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3044         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3045   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3046         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3047         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3048         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3049         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3050   #endif
3051         ret = fib6_init();
3052         if (ret)
3053                 goto out_register_subsys;
3054
3055         ret = xfrm6_init();
3056         if (ret)
3057                 goto out_fib6_init;
3058
3059         ret = fib6_rules_init();
3060         if (ret)
3061                 goto xfrm6_init;
3062
3063         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3064         if (ret)
3065                 goto fib6_rules_init;
3066
3067         ret = -ENOBUFS;
3068         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3069             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3070             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3071                 goto out_register_late_subsys;
3072
3073         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3074         if (ret)
3075                 goto out_register_late_subsys;
3076
3077 out:
3078         return ret;
3079
3080 out_register_late_subsys:
3081         unregister_pernet_subsys(&ip6_route_net_late_ops);
3082 fib6_rules_init:
3083         fib6_rules_cleanup();
3084 xfrm6_init:
3085         xfrm6_fini();
3086 out_fib6_init:
3087         fib6_gc_cleanup();
3088 out_register_subsys:
3089         unregister_pernet_subsys(&ip6_route_net_ops);
3090 out_dst_entries:
3091         dst_entries_destroy(&ip6_dst_blackhole_ops);
3092 out_kmem_cache:
3093         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3094         goto out;
3095 }
3096
3097 void ip6_route_cleanup(void)
3098 {
3099         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3100         unregister_pernet_subsys(&ip6_route_net_late_ops);
3101         fib6_rules_cleanup();
3102         xfrm6_fini();
3103         fib6_gc_cleanup();
3104         unregister_pernet_subsys(&ip6_route_net_ops);
3105         dst_entries_destroy(&ip6_dst_blackhole_ops);
3106         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3107 }