]> git.karo-electronics.de Git - karo-tx-linux.git/blob - net/ipv6/route.c
ipv6: Set default hoplimit as zero.
[karo-tx-linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
65 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
66                                     const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int      ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void             ip6_dst_destroy(struct dst_entry *);
72 static void             ip6_dst_ifdown(struct dst_entry *,
73                                        struct net_device *dev, int how);
74 static int               ip6_dst_gc(struct dst_ops *ops);
75
76 static int              ip6_pkt_discard(struct sk_buff *skb);
77 static int              ip6_pkt_discard_out(struct sk_buff *skb);
78 static void             ip6_link_failure(struct sk_buff *skb);
79 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
80
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83                                            const struct in6_addr *prefix, int prefixlen,
84                                            const struct in6_addr *gwaddr, int ifindex,
85                                            unsigned pref);
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87                                            const struct in6_addr *prefix, int prefixlen,
88                                            const struct in6_addr *gwaddr, int ifindex);
89 #endif
90
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
92 {
93         struct rt6_info *rt = (struct rt6_info *) dst;
94         struct inet_peer *peer;
95         u32 *p = NULL;
96
97         if (!(rt->dst.flags & DST_HOST))
98                 return NULL;
99
100         if (!rt->rt6i_peer)
101                 rt6_bind_peer(rt, 1);
102
103         peer = rt->rt6i_peer;
104         if (peer) {
105                 u32 *old_p = __DST_METRICS_PTR(old);
106                 unsigned long prev, new;
107
108                 p = peer->metrics;
109                 if (inet_metrics_new(peer))
110                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111
112                 new = (unsigned long) p;
113                 prev = cmpxchg(&dst->_metrics, old, new);
114
115                 if (prev != old) {
116                         p = __DST_METRICS_PTR(prev);
117                         if (prev & DST_METRICS_READ_ONLY)
118                                 p = NULL;
119                 }
120         }
121         return p;
122 }
123
124 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 {
126         struct in6_addr *p = &rt->rt6i_gateway;
127
128         if (!ipv6_addr_any(p))
129                 return (const void *) p;
130         return daddr;
131 }
132
133 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 {
135         struct rt6_info *rt = (struct rt6_info *) dst;
136         struct neighbour *n;
137
138         daddr = choose_neigh_daddr(rt, daddr);
139         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
140         if (n)
141                 return n;
142         return neigh_create(&nd_tbl, daddr, dst->dev);
143 }
144
145 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 {
147         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148         if (!n) {
149                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
150                 if (IS_ERR(n))
151                         return PTR_ERR(n);
152         }
153         dst_set_neighbour(&rt->dst, n);
154
155         return 0;
156 }
157
158 static struct dst_ops ip6_dst_ops_template = {
159         .family                 =       AF_INET6,
160         .protocol               =       cpu_to_be16(ETH_P_IPV6),
161         .gc                     =       ip6_dst_gc,
162         .gc_thresh              =       1024,
163         .check                  =       ip6_dst_check,
164         .default_advmss         =       ip6_default_advmss,
165         .mtu                    =       ip6_mtu,
166         .cow_metrics            =       ipv6_cow_metrics,
167         .destroy                =       ip6_dst_destroy,
168         .ifdown                 =       ip6_dst_ifdown,
169         .negative_advice        =       ip6_negative_advice,
170         .link_failure           =       ip6_link_failure,
171         .update_pmtu            =       ip6_rt_update_pmtu,
172         .local_out              =       __ip6_local_out,
173         .neigh_lookup           =       ip6_neigh_lookup,
174 };
175
176 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 {
178         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179
180         return mtu ? : dst->dev->mtu;
181 }
182
183 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
184 {
185 }
186
187 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
188                                          unsigned long old)
189 {
190         return NULL;
191 }
192
193 static struct dst_ops ip6_dst_blackhole_ops = {
194         .family                 =       AF_INET6,
195         .protocol               =       cpu_to_be16(ETH_P_IPV6),
196         .destroy                =       ip6_dst_destroy,
197         .check                  =       ip6_dst_check,
198         .mtu                    =       ip6_blackhole_mtu,
199         .default_advmss         =       ip6_default_advmss,
200         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
201         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
202         .neigh_lookup           =       ip6_neigh_lookup,
203 };
204
/*
 * Read-only metrics template.  The hop-limit slot is spelled out as 0;
 * every other slot is zero as well because the array has static storage.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};
208
209 static struct rt6_info ip6_null_entry_template = {
210         .dst = {
211                 .__refcnt       = ATOMIC_INIT(1),
212                 .__use          = 1,
213                 .obsolete       = -1,
214                 .error          = -ENETUNREACH,
215                 .input          = ip6_pkt_discard,
216                 .output         = ip6_pkt_discard_out,
217         },
218         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
219         .rt6i_protocol  = RTPROT_KERNEL,
220         .rt6i_metric    = ~(u32) 0,
221         .rt6i_ref       = ATOMIC_INIT(1),
222 };
223
224 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
225
226 static int ip6_pkt_prohibit(struct sk_buff *skb);
227 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
228
229 static struct rt6_info ip6_prohibit_entry_template = {
230         .dst = {
231                 .__refcnt       = ATOMIC_INIT(1),
232                 .__use          = 1,
233                 .obsolete       = -1,
234                 .error          = -EACCES,
235                 .input          = ip6_pkt_prohibit,
236                 .output         = ip6_pkt_prohibit_out,
237         },
238         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
239         .rt6i_protocol  = RTPROT_KERNEL,
240         .rt6i_metric    = ~(u32) 0,
241         .rt6i_ref       = ATOMIC_INIT(1),
242 };
243
244 static struct rt6_info ip6_blk_hole_entry_template = {
245         .dst = {
246                 .__refcnt       = ATOMIC_INIT(1),
247                 .__use          = 1,
248                 .obsolete       = -1,
249                 .error          = -EINVAL,
250                 .input          = dst_discard,
251                 .output         = dst_discard,
252         },
253         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
254         .rt6i_protocol  = RTPROT_KERNEL,
255         .rt6i_metric    = ~(u32) 0,
256         .rt6i_ref       = ATOMIC_INIT(1),
257 };
258
259 #endif
260
261 /* allocate dst with ip6_dst_ops */
262 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
263                                              struct net_device *dev,
264                                              int flags)
265 {
266         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
267
268         if (rt)
269                 memset(&rt->rt6i_table, 0,
270                        sizeof(*rt) - sizeof(struct dst_entry));
271
272         return rt;
273 }
274
275 static void ip6_dst_destroy(struct dst_entry *dst)
276 {
277         struct rt6_info *rt = (struct rt6_info *)dst;
278         struct inet6_dev *idev = rt->rt6i_idev;
279         struct inet_peer *peer = rt->rt6i_peer;
280
281         if (!(rt->dst.flags & DST_HOST))
282                 dst_destroy_metrics_generic(dst);
283
284         if (idev) {
285                 rt->rt6i_idev = NULL;
286                 in6_dev_put(idev);
287         }
288
289         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
290                 dst_release(dst->from);
291
292         if (peer) {
293                 rt->rt6i_peer = NULL;
294                 inet_putpeer(peer);
295         }
296 }
297
298 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
299
300 static u32 rt6_peer_genid(void)
301 {
302         return atomic_read(&__rt6_peer_genid);
303 }
304
305 void rt6_bind_peer(struct rt6_info *rt, int create)
306 {
307         struct inet_peer *peer;
308
309         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
310         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
311                 inet_putpeer(peer);
312         else
313                 rt->rt6i_peer_genid = rt6_peer_genid();
314 }
315
316 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
317                            int how)
318 {
319         struct rt6_info *rt = (struct rt6_info *)dst;
320         struct inet6_dev *idev = rt->rt6i_idev;
321         struct net_device *loopback_dev =
322                 dev_net(dev)->loopback_dev;
323
324         if (dev != loopback_dev && idev && idev->dev == dev) {
325                 struct inet6_dev *loopback_idev =
326                         in6_dev_get(loopback_dev);
327                 if (loopback_idev) {
328                         rt->rt6i_idev = loopback_idev;
329                         in6_dev_put(idev);
330                 }
331         }
332 }
333
334 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
335 {
336         struct rt6_info *ort = NULL;
337
338         if (rt->rt6i_flags & RTF_EXPIRES) {
339                 if (time_after(jiffies, rt->dst.expires))
340                         return 1;
341         } else if (rt->dst.from) {
342                 ort = (struct rt6_info *) rt->dst.from;
343                 return (ort->rt6i_flags & RTF_EXPIRES) &&
344                         time_after(jiffies, ort->dst.expires);
345         }
346         return 0;
347 }
348
349 static inline int rt6_need_strict(const struct in6_addr *daddr)
350 {
351         return ipv6_addr_type(daddr) &
352                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
353 }
354
355 /*
356  *      Route lookup. Any table->tb6_lock is implied.
357  */
358
359 static inline struct rt6_info *rt6_device_match(struct net *net,
360                                                     struct rt6_info *rt,
361                                                     const struct in6_addr *saddr,
362                                                     int oif,
363                                                     int flags)
364 {
365         struct rt6_info *local = NULL;
366         struct rt6_info *sprt;
367
368         if (!oif && ipv6_addr_any(saddr))
369                 goto out;
370
371         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
372                 struct net_device *dev = sprt->dst.dev;
373
374                 if (oif) {
375                         if (dev->ifindex == oif)
376                                 return sprt;
377                         if (dev->flags & IFF_LOOPBACK) {
378                                 if (!sprt->rt6i_idev ||
379                                     sprt->rt6i_idev->dev->ifindex != oif) {
380                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
381                                                 continue;
382                                         if (local && (!oif ||
383                                                       local->rt6i_idev->dev->ifindex == oif))
384                                                 continue;
385                                 }
386                                 local = sprt;
387                         }
388                 } else {
389                         if (ipv6_chk_addr(net, saddr, dev,
390                                           flags & RT6_LOOKUP_F_IFACE))
391                                 return sprt;
392                 }
393         }
394
395         if (oif) {
396                 if (local)
397                         return local;
398
399                 if (flags & RT6_LOOKUP_F_IFACE)
400                         return net->ipv6.ip6_null_entry;
401         }
402 out:
403         return rt;
404 }
405
406 #ifdef CONFIG_IPV6_ROUTER_PREF
407 static void rt6_probe(struct rt6_info *rt)
408 {
409         struct neighbour *neigh;
410         /*
411          * Okay, this does not seem to be appropriate
412          * for now, however, we need to check if it
413          * is really so; aka Router Reachability Probing.
414          *
415          * Router Reachability Probe MUST be rate-limited
416          * to no more than one per minute.
417          */
418         rcu_read_lock();
419         neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
420         if (!neigh || (neigh->nud_state & NUD_VALID))
421                 goto out;
422         read_lock_bh(&neigh->lock);
423         if (!(neigh->nud_state & NUD_VALID) &&
424             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
425                 struct in6_addr mcaddr;
426                 struct in6_addr *target;
427
428                 neigh->updated = jiffies;
429                 read_unlock_bh(&neigh->lock);
430
431                 target = (struct in6_addr *)&neigh->primary_key;
432                 addrconf_addr_solict_mult(target, &mcaddr);
433                 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
434         } else {
435                 read_unlock_bh(&neigh->lock);
436         }
437 out:
438         rcu_read_unlock();
439 }
440 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
        /* intentionally empty */
        return;
}
444 #endif
445
446 /*
447  * Default Router Selection (RFC 2461 6.3.6)
448  */
449 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
450 {
451         struct net_device *dev = rt->dst.dev;
452         if (!oif || dev->ifindex == oif)
453                 return 2;
454         if ((dev->flags & IFF_LOOPBACK) &&
455             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
456                 return 1;
457         return 0;
458 }
459
460 static inline int rt6_check_neigh(struct rt6_info *rt)
461 {
462         struct neighbour *neigh;
463         int m;
464
465         rcu_read_lock();
466         neigh = dst_get_neighbour_noref(&rt->dst);
467         if (rt->rt6i_flags & RTF_NONEXTHOP ||
468             !(rt->rt6i_flags & RTF_GATEWAY))
469                 m = 1;
470         else if (neigh) {
471                 read_lock_bh(&neigh->lock);
472                 if (neigh->nud_state & NUD_VALID)
473                         m = 2;
474 #ifdef CONFIG_IPV6_ROUTER_PREF
475                 else if (neigh->nud_state & NUD_FAILED)
476                         m = 0;
477 #endif
478                 else
479                         m = 1;
480                 read_unlock_bh(&neigh->lock);
481         } else
482                 m = 0;
483         rcu_read_unlock();
484         return m;
485 }
486
487 static int rt6_score_route(struct rt6_info *rt, int oif,
488                            int strict)
489 {
490         int m, n;
491
492         m = rt6_check_dev(rt, oif);
493         if (!m && (strict & RT6_LOOKUP_F_IFACE))
494                 return -1;
495 #ifdef CONFIG_IPV6_ROUTER_PREF
496         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
497 #endif
498         n = rt6_check_neigh(rt);
499         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
500                 return -1;
501         return m;
502 }
503
504 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
505                                    int *mpri, struct rt6_info *match)
506 {
507         int m;
508
509         if (rt6_check_expired(rt))
510                 goto out;
511
512         m = rt6_score_route(rt, oif, strict);
513         if (m < 0)
514                 goto out;
515
516         if (m > *mpri) {
517                 if (strict & RT6_LOOKUP_F_REACHABLE)
518                         rt6_probe(match);
519                 *mpri = m;
520                 match = rt;
521         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
522                 rt6_probe(rt);
523         }
524
525 out:
526         return match;
527 }
528
529 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
530                                      struct rt6_info *rr_head,
531                                      u32 metric, int oif, int strict)
532 {
533         struct rt6_info *rt, *match;
534         int mpri = -1;
535
536         match = NULL;
537         for (rt = rr_head; rt && rt->rt6i_metric == metric;
538              rt = rt->dst.rt6_next)
539                 match = find_match(rt, oif, strict, &mpri, match);
540         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
541              rt = rt->dst.rt6_next)
542                 match = find_match(rt, oif, strict, &mpri, match);
543
544         return match;
545 }
546
547 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
548 {
549         struct rt6_info *match, *rt0;
550         struct net *net;
551
552         rt0 = fn->rr_ptr;
553         if (!rt0)
554                 fn->rr_ptr = rt0 = fn->leaf;
555
556         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
557
558         if (!match &&
559             (strict & RT6_LOOKUP_F_REACHABLE)) {
560                 struct rt6_info *next = rt0->dst.rt6_next;
561
562                 /* no entries matched; do round-robin */
563                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
564                         next = fn->leaf;
565
566                 if (next != rt0)
567                         fn->rr_ptr = next;
568         }
569
570         net = dev_net(rt0->dst.dev);
571         return match ? match : net->ipv6.ip6_null_entry;
572 }
573
574 #ifdef CONFIG_IPV6_ROUTE_INFO
575 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
576                   const struct in6_addr *gwaddr)
577 {
578         struct net *net = dev_net(dev);
579         struct route_info *rinfo = (struct route_info *) opt;
580         struct in6_addr prefix_buf, *prefix;
581         unsigned int pref;
582         unsigned long lifetime;
583         struct rt6_info *rt;
584
585         if (len < sizeof(struct route_info)) {
586                 return -EINVAL;
587         }
588
589         /* Sanity check for prefix_len and length */
590         if (rinfo->length > 3) {
591                 return -EINVAL;
592         } else if (rinfo->prefix_len > 128) {
593                 return -EINVAL;
594         } else if (rinfo->prefix_len > 64) {
595                 if (rinfo->length < 2) {
596                         return -EINVAL;
597                 }
598         } else if (rinfo->prefix_len > 0) {
599                 if (rinfo->length < 1) {
600                         return -EINVAL;
601                 }
602         }
603
604         pref = rinfo->route_pref;
605         if (pref == ICMPV6_ROUTER_PREF_INVALID)
606                 return -EINVAL;
607
608         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
609
610         if (rinfo->length == 3)
611                 prefix = (struct in6_addr *)rinfo->prefix;
612         else {
613                 /* this function is safe */
614                 ipv6_addr_prefix(&prefix_buf,
615                                  (struct in6_addr *)rinfo->prefix,
616                                  rinfo->prefix_len);
617                 prefix = &prefix_buf;
618         }
619
620         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
621                                 dev->ifindex);
622
623         if (rt && !lifetime) {
624                 ip6_del_rt(rt);
625                 rt = NULL;
626         }
627
628         if (!rt && lifetime)
629                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
630                                         pref);
631         else if (rt)
632                 rt->rt6i_flags = RTF_ROUTEINFO |
633                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
634
635         if (rt) {
636                 if (!addrconf_finite_timeout(lifetime))
637                         rt6_clean_expires(rt);
638                 else
639                         rt6_set_expires(rt, jiffies + HZ * lifetime);
640
641                 dst_release(&rt->dst);
642         }
643         return 0;
644 }
645 #endif
646
/*
 * BACKTRACK - climb the fib6 tree when the current lookup yielded the
 * null entry.
 *
 * Expands in a function that has local variables `rt` and `fn` and labels
 * `out` and `restart`.  When `rt` is __net's ip6_null_entry, the macro
 * repeatedly moves `fn` to its parent -- or, if the parent has a subtree
 * other than `fn`, to the result of looking up `saddr` in that subtree --
 * until it either reaches a node flagged RTN_TL_ROOT (jumps to `out`) or a
 * node flagged RTN_RTINFO (jumps to `restart`).
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
664
665 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
666                                              struct fib6_table *table,
667                                              struct flowi6 *fl6, int flags)
668 {
669         struct fib6_node *fn;
670         struct rt6_info *rt;
671
672         read_lock_bh(&table->tb6_lock);
673         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
674 restart:
675         rt = fn->leaf;
676         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
677         BACKTRACK(net, &fl6->saddr);
678 out:
679         dst_use(&rt->dst, jiffies);
680         read_unlock_bh(&table->tb6_lock);
681         return rt;
682
683 }
684
685 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
686                                     int flags)
687 {
688         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
689 }
690 EXPORT_SYMBOL_GPL(ip6_route_lookup);
691
692 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
693                             const struct in6_addr *saddr, int oif, int strict)
694 {
695         struct flowi6 fl6 = {
696                 .flowi6_oif = oif,
697                 .daddr = *daddr,
698         };
699         struct dst_entry *dst;
700         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
701
702         if (saddr) {
703                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
704                 flags |= RT6_LOOKUP_F_HAS_SADDR;
705         }
706
707         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
708         if (dst->error == 0)
709                 return (struct rt6_info *) dst;
710
711         dst_release(dst);
712
713         return NULL;
714 }
715
716 EXPORT_SYMBOL(rt6_lookup);
717
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed. In any case, if the caller does not hold a reference
   to the route, it may be destroyed.
 */
723
724 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
725 {
726         int err;
727         struct fib6_table *table;
728
729         table = rt->rt6i_table;
730         write_lock_bh(&table->tb6_lock);
731         err = fib6_add(&table->tb6_root, rt, info);
732         write_unlock_bh(&table->tb6_lock);
733
734         return err;
735 }
736
737 int ip6_ins_rt(struct rt6_info *rt)
738 {
739         struct nl_info info = {
740                 .nl_net = dev_net(rt->dst.dev),
741         };
742         return __ip6_ins_rt(rt, &info);
743 }
744
745 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
746                                       const struct in6_addr *daddr,
747                                       const struct in6_addr *saddr)
748 {
749         struct rt6_info *rt;
750
751         /*
752          *      Clone the route.
753          */
754
755         rt = ip6_rt_copy(ort, daddr);
756
757         if (rt) {
758                 int attempts = !in_softirq();
759
760                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
761                         if (ort->rt6i_dst.plen != 128 &&
762                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
763                                 rt->rt6i_flags |= RTF_ANYCAST;
764                         rt->rt6i_gateway = *daddr;
765                 }
766
767                 rt->rt6i_flags |= RTF_CACHE;
768
769 #ifdef CONFIG_IPV6_SUBTREES
770                 if (rt->rt6i_src.plen && saddr) {
771                         rt->rt6i_src.addr = *saddr;
772                         rt->rt6i_src.plen = 128;
773                 }
774 #endif
775
776         retry:
777                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
778                         struct net *net = dev_net(rt->dst.dev);
779                         int saved_rt_min_interval =
780                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
781                         int saved_rt_elasticity =
782                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
783
784                         if (attempts-- > 0) {
785                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
786                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
787
788                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
789
790                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
791                                         saved_rt_elasticity;
792                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
793                                         saved_rt_min_interval;
794                                 goto retry;
795                         }
796
797                         if (net_ratelimit())
798                                 printk(KERN_WARNING
799                                        "ipv6: Neighbour table overflow.\n");
800                         dst_free(&rt->dst);
801                         return NULL;
802                 }
803         }
804
805         return rt;
806 }
807
808 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
809                                         const struct in6_addr *daddr)
810 {
811         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
812
813         if (rt) {
814                 rt->rt6i_flags |= RTF_CACHE;
815                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
816         }
817         return rt;
818 }
819
/*
 * Core per-table route resolution shared by the input and output paths.
 *
 * Selects a route for @fl6 from @table (honouring @oif and the
 * RT6_LOOKUP_F_IFACE bit of @flags).  When the selected route is neither
 * the null entry nor already an RTF_CACHE entry, a per-destination clone
 * is created and inserted into the table; if the insertion fails the
 * whole lookup is retried, up to three attempts in total.
 *
 * The returned route has a reference held for the caller and its
 * lastuse/__use counters updated.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        int attempts = 3;
        int err;
        /* Reachability is only insisted on when forwarding is disabled. */
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

restart:
        rt = rt6_select(fn, oif, strict | reachable);

        /* May jump to restart (fn moved up the tree) or to out (root hit). */
        BACKTRACK(net, &fl6->saddr);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* Pin the route, then drop the table lock before allocating. */
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
        else if (!(rt->dst.flags & DST_HOST))
                nrt = rt6_alloc_clone(rt, &fl6->daddr);
        else
                goto out2;

        /* Swap the reference from the original route to the clone (or null entry). */
        dst_release(&rt->dst);
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        /*
         * First arrival here with the reachability requirement still set:
         * clear it and redo the lookup (table lock still held).
         */
        if (reachable) {
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}
889
890 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
891                                             struct flowi6 *fl6, int flags)
892 {
893         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
894 }
895
896 static struct dst_entry *ip6_route_input_lookup(struct net *net,
897                                                 struct net_device *dev,
898                                                 struct flowi6 *fl6, int flags)
899 {
900         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
901                 flags |= RT6_LOOKUP_F_IFACE;
902
903         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
904 }
905
906 void ip6_route_input(struct sk_buff *skb)
907 {
908         const struct ipv6hdr *iph = ipv6_hdr(skb);
909         struct net *net = dev_net(skb->dev);
910         int flags = RT6_LOOKUP_F_HAS_SADDR;
911         struct flowi6 fl6 = {
912                 .flowi6_iif = skb->dev->ifindex,
913                 .daddr = iph->daddr,
914                 .saddr = iph->saddr,
915                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
916                 .flowi6_mark = skb->mark,
917                 .flowi6_proto = iph->nexthdr,
918         };
919
920         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
921 }
922
923 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
924                                              struct flowi6 *fl6, int flags)
925 {
926         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
927 }
928
929 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
930                                     struct flowi6 *fl6)
931 {
932         int flags = 0;
933
934         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
935                 flags |= RT6_LOOKUP_F_IFACE;
936
937         if (!ipv6_addr_any(&fl6->saddr))
938                 flags |= RT6_LOOKUP_F_HAS_SADDR;
939         else if (sk)
940                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
941
942         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
943 }
944
945 EXPORT_SYMBOL(ip6_route_output);
946
947 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
948 {
949         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
950         struct dst_entry *new = NULL;
951
952         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
953         if (rt) {
954                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
955
956                 new = &rt->dst;
957
958                 new->__use = 1;
959                 new->input = dst_discard;
960                 new->output = dst_discard;
961
962                 if (dst_metrics_read_only(&ort->dst))
963                         new->_metrics = ort->dst._metrics;
964                 else
965                         dst_copy_metrics(new, &ort->dst);
966                 rt->rt6i_idev = ort->rt6i_idev;
967                 if (rt->rt6i_idev)
968                         in6_dev_hold(rt->rt6i_idev);
969
970                 rt->rt6i_gateway = ort->rt6i_gateway;
971                 rt->rt6i_flags = ort->rt6i_flags;
972                 rt6_clean_expires(rt);
973                 rt->rt6i_metric = 0;
974
975                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
976 #ifdef CONFIG_IPV6_SUBTREES
977                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
978 #endif
979
980                 dst_free(new);
981         }
982
983         dst_release(dst_orig);
984         return new ? new : ERR_PTR(-ENOMEM);
985 }
986
987 /*
988  *      Destination cache support functions
989  */
990
991 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
992 {
993         struct rt6_info *rt;
994
995         rt = (struct rt6_info *) dst;
996
997         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
998                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
999                         if (!rt->rt6i_peer)
1000                                 rt6_bind_peer(rt, 0);
1001                         rt->rt6i_peer_genid = rt6_peer_genid();
1002                 }
1003                 return dst;
1004         }
1005         return NULL;
1006 }
1007
1008 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1009 {
1010         struct rt6_info *rt = (struct rt6_info *) dst;
1011
1012         if (rt) {
1013                 if (rt->rt6i_flags & RTF_CACHE) {
1014                         if (rt6_check_expired(rt)) {
1015                                 ip6_del_rt(rt);
1016                                 dst = NULL;
1017                         }
1018                 } else {
1019                         dst_release(dst);
1020                         dst = NULL;
1021                 }
1022         }
1023         return dst;
1024 }
1025
1026 static void ip6_link_failure(struct sk_buff *skb)
1027 {
1028         struct rt6_info *rt;
1029
1030         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1031
1032         rt = (struct rt6_info *) skb_dst(skb);
1033         if (rt) {
1034                 if (rt->rt6i_flags & RTF_CACHE)
1035                         rt6_update_expires(rt, 0);
1036                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1037                         rt->rt6i_node->fn_sernum = -1;
1038         }
1039 }
1040
1041 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1042 {
1043         struct rt6_info *rt6 = (struct rt6_info*)dst;
1044
1045         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1046                 rt6->rt6i_flags |= RTF_MODIFIED;
1047                 if (mtu < IPV6_MIN_MTU) {
1048                         u32 features = dst_metric(dst, RTAX_FEATURES);
1049                         mtu = IPV6_MIN_MTU;
1050                         features |= RTAX_FEATURE_ALLFRAG;
1051                         dst_metric_set(dst, RTAX_FEATURES, features);
1052                 }
1053                 dst_metric_set(dst, RTAX_MTU, mtu);
1054         }
1055 }
1056
1057 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1058 {
1059         struct net_device *dev = dst->dev;
1060         unsigned int mtu = dst_mtu(dst);
1061         struct net *net = dev_net(dev);
1062
1063         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1064
1065         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1066                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1067
1068         /*
1069          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1070          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1071          * IPV6_MAXPLEN is also valid and means: "any MSS,
1072          * rely only on pmtu discovery"
1073          */
1074         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1075                 mtu = IPV6_MAXPLEN;
1076         return mtu;
1077 }
1078
1079 static unsigned int ip6_mtu(const struct dst_entry *dst)
1080 {
1081         struct inet6_dev *idev;
1082         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1083
1084         if (mtu)
1085                 return mtu;
1086
1087         mtu = IPV6_MIN_MTU;
1088
1089         rcu_read_lock();
1090         idev = __in6_dev_get(dst->dev);
1091         if (idev)
1092                 mtu = idev->cnf.mtu6;
1093         rcu_read_unlock();
1094
1095         return mtu;
1096 }
1097
/*
 * Singly linked list (chained through dst->next) of the dst entries
 * created by icmp6_dst_alloc(); walked by icmp6_dst_gc() and
 * icmp6_clean_all().  Every access visible in this file is made with
 * icmp6_dst_lock held.
 */
1098 static struct dst_entry *icmp6_dst_gc_list;
1099 static DEFINE_SPINLOCK(icmp6_dst_lock);
1100
1101 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1102                                   struct neighbour *neigh,
1103                                   struct flowi6 *fl6)
1104 {
1105         struct dst_entry *dst;
1106         struct rt6_info *rt;
1107         struct inet6_dev *idev = in6_dev_get(dev);
1108         struct net *net = dev_net(dev);
1109
1110         if (unlikely(!idev))
1111                 return ERR_PTR(-ENODEV);
1112
1113         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1114         if (unlikely(!rt)) {
1115                 in6_dev_put(idev);
1116                 dst = ERR_PTR(-ENOMEM);
1117                 goto out;
1118         }
1119
1120         if (neigh)
1121                 neigh_hold(neigh);
1122         else {
1123                 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1124                 if (IS_ERR(neigh)) {
1125                         in6_dev_put(idev);
1126                         dst_free(&rt->dst);
1127                         return ERR_CAST(neigh);
1128                 }
1129         }
1130
1131         rt->dst.flags |= DST_HOST;
1132         rt->dst.output  = ip6_output;
1133         dst_set_neighbour(&rt->dst, neigh);
1134         atomic_set(&rt->dst.__refcnt, 1);
1135         rt->rt6i_dst.addr = fl6->daddr;
1136         rt->rt6i_dst.plen = 128;
1137         rt->rt6i_idev     = idev;
1138         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1139
1140         spin_lock_bh(&icmp6_dst_lock);
1141         rt->dst.next = icmp6_dst_gc_list;
1142         icmp6_dst_gc_list = &rt->dst;
1143         spin_unlock_bh(&icmp6_dst_lock);
1144
1145         fib6_force_start_gc(net);
1146
1147         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1148
1149 out:
1150         return dst;
1151 }
1152
1153 int icmp6_dst_gc(void)
1154 {
1155         struct dst_entry *dst, **pprev;
1156         int more = 0;
1157
1158         spin_lock_bh(&icmp6_dst_lock);
1159         pprev = &icmp6_dst_gc_list;
1160
1161         while ((dst = *pprev) != NULL) {
1162                 if (!atomic_read(&dst->__refcnt)) {
1163                         *pprev = dst->next;
1164                         dst_free(dst);
1165                 } else {
1166                         pprev = &dst->next;
1167                         ++more;
1168                 }
1169         }
1170
1171         spin_unlock_bh(&icmp6_dst_lock);
1172
1173         return more;
1174 }
1175
1176 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1177                             void *arg)
1178 {
1179         struct dst_entry *dst, **pprev;
1180
1181         spin_lock_bh(&icmp6_dst_lock);
1182         pprev = &icmp6_dst_gc_list;
1183         while ((dst = *pprev) != NULL) {
1184                 struct rt6_info *rt = (struct rt6_info *) dst;
1185                 if (func(rt, arg)) {
1186                         *pprev = dst->next;
1187                         dst_free(dst);
1188                 } else {
1189                         pprev = &dst->next;
1190                 }
1191         }
1192         spin_unlock_bh(&icmp6_dst_lock);
1193 }
1194
1195 static int ip6_dst_gc(struct dst_ops *ops)
1196 {
1197         unsigned long now = jiffies;
1198         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1199         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1200         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1201         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1202         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1203         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1204         int entries;
1205
1206         entries = dst_entries_get_fast(ops);
1207         if (time_after(rt_last_gc + rt_min_interval, now) &&
1208             entries <= rt_max_size)
1209                 goto out;
1210
1211         net->ipv6.ip6_rt_gc_expire++;
1212         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1213         net->ipv6.ip6_rt_last_gc = now;
1214         entries = dst_entries_get_slow(ops);
1215         if (entries < ops->gc_thresh)
1216                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1217 out:
1218         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1219         return entries > rt_max_size;
1220 }
1221
1222 /* Clean host part of a prefix. Not necessary in radix tree,
1223    but results in cleaner routing tables.
1224
1225    Remove it only when all the things will work!
1226  */
1227
1228 int ip6_dst_hoplimit(struct dst_entry *dst)
1229 {
1230         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1231         if (hoplimit == 0) {
1232                 struct net_device *dev = dst->dev;
1233                 struct inet6_dev *idev;
1234
1235                 rcu_read_lock();
1236                 idev = __in6_dev_get(dev);
1237                 if (idev)
1238                         hoplimit = idev->cnf.hop_limit;
1239                 else
1240                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1241                 rcu_read_unlock();
1242         }
1243         return hoplimit;
1244 }
1245 EXPORT_SYMBOL(ip6_dst_hoplimit);
1246
1247 /*
1248  *
1249  */
1250
/*
 * ip6_route_add - build a route from @cfg and insert it into a FIB table.
 *
 * Validates the prefix lengths, takes references on the output device and
 * its inet6_dev (when cfg->fc_ifindex is set), selects or creates the
 * table, allocates and fills an rt6_info and finally hands it to
 * __ip6_ins_rt().
 *
 * Returns the result of __ip6_ins_rt() on the install path, otherwise a
 * negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM, -EHOSTUNREACH, or
 * the error returned by rt6_bind_neighbour()).  Failures that reach the
 * "out" label drop the dev/idev references and free the allocated route.
 *
 * Side effects on @cfg: fc_metric is defaulted to IP6_RT_PRIO_USER when 0,
 * fc_protocol to RTPROT_BOOT when RTPROT_UNSPEC, and fc_nlinfo.nl_net is
 * overwritten with dev_net(dev) before the insert.
 */
1251 int ip6_route_add(struct fib6_config *cfg)
1252 {
1253         int err;
1254         struct net *net = cfg->fc_nlinfo.nl_net;
1255         struct rt6_info *rt = NULL;
1256         struct net_device *dev = NULL;
1257         struct inet6_dev *idev = NULL;
1258         struct fib6_table *table;
1259         int addr_type;
1260
1261         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1262                 return -EINVAL;
1263 #ifndef CONFIG_IPV6_SUBTREES
             /* source-prefix routes are rejected without subtree support */
1264         if (cfg->fc_src_len)
1265                 return -EINVAL;
1266 #endif
             /* Take references on the requested device and its inet6_dev;
              * both are dropped again at "out" on failure.
              */
1267         if (cfg->fc_ifindex) {
1268                 err = -ENODEV;
1269                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1270                 if (!dev)
1271                         goto out;
1272                 idev = in6_dev_get(dev);
1273                 if (!idev)
1274                         goto out;
1275         }
1276
1277         if (cfg->fc_metric == 0)
1278                 cfg->fc_metric = IP6_RT_PRIO_USER;
1279
             /* A netlink request without NLM_F_CREATE first tries an existing
              * table, but still creates one (with a warning) if it is missing.
              */
1280         err = -ENOBUFS;
1281         if (cfg->fc_nlinfo.nlh &&
1282             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1283                 table = fib6_get_table(net, cfg->fc_table);
1284                 if (!table) {
1285                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1286                         table = fib6_new_table(net, cfg->fc_table);
1287                 }
1288         } else {
1289                 table = fib6_new_table(net, cfg->fc_table);
1290         }
1291
1292         if (!table)
1293                 goto out;
1294
1295         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1296
1297         if (!rt) {
1298                 err = -ENOMEM;
1299                 goto out;
1300         }
1301
1302         rt->dst.obsolete = -1;
1303
1304         if (cfg->fc_flags & RTF_EXPIRES)
1305                 rt6_set_expires(rt, jiffies +
1306                                 clock_t_to_jiffies(cfg->fc_expires));
1307         else
1308                 rt6_clean_expires(rt);
1309
1310         if (cfg->fc_protocol == RTPROT_UNSPEC)
1311                 cfg->fc_protocol = RTPROT_BOOT;
1312         rt->rt6i_protocol = cfg->fc_protocol;
1313
1314         addr_type = ipv6_addr_type(&cfg->fc_dst);
1315
             /* input handler: multicast, local delivery, or forwarding */
1316         if (addr_type & IPV6_ADDR_MULTICAST)
1317                 rt->dst.input = ip6_mc_input;
1318         else if (cfg->fc_flags & RTF_LOCAL)
1319                 rt->dst.input = ip6_input;
1320         else
1321                 rt->dst.input = ip6_forward;
1322
1323         rt->dst.output = ip6_output;
1324
1325         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1326         rt->rt6i_dst.plen = cfg->fc_dst_len;
1327         if (rt->rt6i_dst.plen == 128)
1328                rt->dst.flags |= DST_HOST;
1329
             /* non-host routes that carry metrics get a zeroed metrics array */
1330         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1331                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1332                 if (!metrics) {
1333                         err = -ENOMEM;
1334                         goto out;
1335                 }
1336                 dst_init_metrics(&rt->dst, metrics, 0);
1337         }
1338 #ifdef CONFIG_IPV6_SUBTREES
1339         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1340         rt->rt6i_src.plen = cfg->fc_src_len;
1341 #endif
1342
1343         rt->rt6i_metric = cfg->fc_metric;
1344
1345         /* We cannot add true routes via loopback here,
1346            they would result in kernel looping; promote them to reject routes
1347          */
1348         if ((cfg->fc_flags & RTF_REJECT) ||
1349             (dev && (dev->flags & IFF_LOOPBACK) &&
1350              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1351              !(cfg->fc_flags & RTF_LOCAL))) {
1352                 /* hold loopback dev/idev if we haven't done so. */
1353                 if (dev != net->loopback_dev) {
1354                         if (dev) {
1355                                 dev_put(dev);
1356                                 in6_dev_put(idev);
1357                         }
1358                         dev = net->loopback_dev;
1359                         dev_hold(dev);
1360                         idev = in6_dev_get(dev);
1361                         if (!idev) {
1362                                 err = -ENODEV;
1363                                 goto out;
1364                         }
1365                 }
1366                 rt->dst.output = ip6_pkt_discard_out;
1367                 rt->dst.input = ip6_pkt_discard;
1368                 rt->dst.error = -ENETUNREACH;
1369                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1370                 goto install_route;
1371         }
1372
             /* Gateway route: validate the next hop and, when no device was
              * given, take dev/idev from the route that reaches the gateway.
              */
1373         if (cfg->fc_flags & RTF_GATEWAY) {
1374                 const struct in6_addr *gw_addr;
1375                 int gwa_type;
1376
1377                 gw_addr = &cfg->fc_gateway;
1378                 rt->rt6i_gateway = *gw_addr;
1379                 gwa_type = ipv6_addr_type(gw_addr);
1380
1381                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1382                         struct rt6_info *grt;
1383
1384                         /* IPv6 strictly inhibits using not link-local
1385                            addresses as nexthop address.
1386                            Otherwise, router will not able to send redirects.
1387                            It is very good, but in some (rare!) circumstances
1388                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1389                            some exceptions. --ANK
1390                          */
1391                         err = -EINVAL;
1392                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1393                                 goto out;
1394
1395                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1396
1397                         err = -EHOSTUNREACH;
1398                         if (!grt)
1399                                 goto out;
1400                         if (dev) {
1401                                 if (dev != grt->dst.dev) {
1402                                         dst_release(&grt->dst);
1403                                         goto out;
1404                                 }
1405                         } else {
1406                                 dev = grt->dst.dev;
1407                                 idev = grt->rt6i_idev;
1408                                 dev_hold(dev);
1409                                 in6_dev_hold(grt->rt6i_idev);
1410                         }
                             /* err stays -EHOSTUNREACH unless the gateway is
                              * itself reached through a non-gateway route
                              */
1411                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1412                                 err = 0;
1413                         dst_release(&grt->dst);
1414
1415                         if (err)
1416                                 goto out;
1417                 }
1418                 err = -EINVAL;
1419                 if (!dev || (dev->flags & IFF_LOOPBACK))
1420                         goto out;
1421         }
1422
1423         err = -ENODEV;
1424         if (!dev)
1425                 goto out;
1426
             /* a preferred source address must pass ipv6_chk_addr() for dev */
1427         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1428                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1429                         err = -EINVAL;
1430                         goto out;
1431                 }
1432                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1433                 rt->rt6i_prefsrc.plen = 128;
1434         } else
1435                 rt->rt6i_prefsrc.plen = 0;
1436
1437         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1438                 err = rt6_bind_neighbour(rt, dev);
1439                 if (err)
1440                         goto out;
1441         }
1442
1443         rt->rt6i_flags = cfg->fc_flags;
1444
1445 install_route:
             /* Copy user-supplied metrics: attribute type 0 is skipped and
              * types above RTAX_MAX are rejected with -EINVAL.
              */
1446         if (cfg->fc_mx) {
1447                 struct nlattr *nla;
1448                 int remaining;
1449
1450                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1451                         int type = nla_type(nla);
1452
1453                         if (type) {
1454                                 if (type > RTAX_MAX) {
1455                                         err = -EINVAL;
1456                                         goto out;
1457                                 }
1458
1459                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1460                         }
1461                 }
1462         }
1463
             /* the dev/idev references held above are stored in the route */
1464         rt->dst.dev = dev;
1465         rt->rt6i_idev = idev;
1466         rt->rt6i_table = table;
1467
1468         cfg->fc_nlinfo.nl_net = dev_net(dev);
1469
1470         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1471
1472 out:
             /* shared error exit: release whatever has been acquired so far */
1473         if (dev)
1474                 dev_put(dev);
1475         if (idev)
1476                 in6_dev_put(idev);
1477         if (rt)
1478                 dst_free(&rt->dst);
1479         return err;
1480 }
1481
1482 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1483 {
1484         int err;
1485         struct fib6_table *table;
1486         struct net *net = dev_net(rt->dst.dev);
1487
1488         if (rt == net->ipv6.ip6_null_entry) {
1489                 err = -ENOENT;
1490                 goto out;
1491         }
1492
1493         table = rt->rt6i_table;
1494         write_lock_bh(&table->tb6_lock);
1495         err = fib6_del(rt, info);
1496         write_unlock_bh(&table->tb6_lock);
1497
1498 out:
1499         dst_release(&rt->dst);
1500         return err;
1501 }
1502
1503 int ip6_del_rt(struct rt6_info *rt)
1504 {
1505         struct nl_info info = {
1506                 .nl_net = dev_net(rt->dst.dev),
1507         };
1508         return __ip6_del_rt(rt, &info);
1509 }
1510
1511 static int ip6_route_del(struct fib6_config *cfg)
1512 {
1513         struct fib6_table *table;
1514         struct fib6_node *fn;
1515         struct rt6_info *rt;
1516         int err = -ESRCH;
1517
1518         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1519         if (!table)
1520                 return err;
1521
1522         read_lock_bh(&table->tb6_lock);
1523
1524         fn = fib6_locate(&table->tb6_root,
1525                          &cfg->fc_dst, cfg->fc_dst_len,
1526                          &cfg->fc_src, cfg->fc_src_len);
1527
1528         if (fn) {
1529                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1530                         if (cfg->fc_ifindex &&
1531                             (!rt->dst.dev ||
1532                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1533                                 continue;
1534                         if (cfg->fc_flags & RTF_GATEWAY &&
1535                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1536                                 continue;
1537                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1538                                 continue;
1539                         dst_hold(&rt->dst);
1540                         read_unlock_bh(&table->tb6_lock);
1541
1542                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1543                 }
1544         }
1545         read_unlock_bh(&table->tb6_lock);
1546
1547         return err;
1548 }
1549
1550 /*
1551  *      Handle redirects
1552  */
/*
 * Flow key for redirect lookups: a regular flowi6 followed by the address
 * that routes are compared against in __ip6_route_redirect().
 *
 * fl6 must remain the first member: __ip6_route_redirect() receives a
 * struct flowi6 pointer and casts it back to struct ip6rd_flowi.
 */
1553 struct ip6rd_flowi {
1554         struct flowi6 fl6;
1555         struct in6_addr gateway;
1556 };
1557
1558 static struct rt6_info *__ip6_route_redirect(struct net *net,
1559                                              struct fib6_table *table,
1560                                              struct flowi6 *fl6,
1561                                              int flags)
1562 {
1563         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1564         struct rt6_info *rt;
1565         struct fib6_node *fn;
1566
1567         /*
1568          * Get the "current" route for this destination and
1569          * check if the redirect has come from approriate router.
1570          *
1571          * RFC 2461 specifies that redirects should only be
1572          * accepted if they come from the nexthop to the target.
1573          * Due to the way the routes are chosen, this notion
1574          * is a bit fuzzy and one might need to check all possible
1575          * routes.
1576          */
1577
1578         read_lock_bh(&table->tb6_lock);
1579         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1580 restart:
1581         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1582                 /*
1583                  * Current route is on-link; redirect is always invalid.
1584                  *
1585                  * Seems, previous statement is not true. It could
1586                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1587                  * But then router serving it might decide, that we should
1588                  * know truth 8)8) --ANK (980726).
1589                  */
1590                 if (rt6_check_expired(rt))
1591                         continue;
1592                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1593                         continue;
1594                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1595                         continue;
1596                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1597                         continue;
1598                 break;
1599         }
1600
1601         if (!rt)
1602                 rt = net->ipv6.ip6_null_entry;
1603         BACKTRACK(net, &fl6->saddr);
1604 out:
1605         dst_hold(&rt->dst);
1606
1607         read_unlock_bh(&table->tb6_lock);
1608
1609         return rt;
1610 };
1611
1612 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1613                                            const struct in6_addr *src,
1614                                            const struct in6_addr *gateway,
1615                                            struct net_device *dev)
1616 {
1617         int flags = RT6_LOOKUP_F_HAS_SADDR;
1618         struct net *net = dev_net(dev);
1619         struct ip6rd_flowi rdfl = {
1620                 .fl6 = {
1621                         .flowi6_oif = dev->ifindex,
1622                         .daddr = *dest,
1623                         .saddr = *src,
1624                 },
1625         };
1626
1627         rdfl.gateway = *gateway;
1628
1629         if (rt6_need_strict(dest))
1630                 flags |= RT6_LOOKUP_F_IFACE;
1631
1632         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1633                                                    flags, __ip6_route_redirect);
1634 }
1635
1636 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1637                   const struct in6_addr *saddr,
1638                   struct neighbour *neigh, u8 *lladdr, int on_link)
1639 {
1640         struct rt6_info *rt, *nrt = NULL;
1641         struct netevent_redirect netevent;
1642         struct net *net = dev_net(neigh->dev);
1643
1644         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1645
1646         if (rt == net->ipv6.ip6_null_entry) {
1647                 if (net_ratelimit())
1648                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1649                                "for redirect target\n");
1650                 goto out;
1651         }
1652
1653         /*
1654          *      We have finally decided to accept it.
1655          */
1656
1657         neigh_update(neigh, lladdr, NUD_STALE,
1658                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1659                      NEIGH_UPDATE_F_OVERRIDE|
1660                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1661                                      NEIGH_UPDATE_F_ISROUTER))
1662                      );
1663
1664         /*
1665          * Redirect received -> path was valid.
1666          * Look, redirects are sent only in response to data packets,
1667          * so that this nexthop apparently is reachable. --ANK
1668          */
1669         dst_confirm(&rt->dst);
1670
1671         /* Duplicate redirect: silently ignore. */
1672         if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1673                 goto out;
1674
1675         nrt = ip6_rt_copy(rt, dest);
1676         if (!nrt)
1677                 goto out;
1678
1679         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1680         if (on_link)
1681                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1682
1683         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1684         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1685
1686         if (ip6_ins_rt(nrt))
1687                 goto out;
1688
1689         netevent.old = &rt->dst;
1690         netevent.new = &nrt->dst;
1691         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1692
1693         if (rt->rt6i_flags & RTF_CACHE) {
1694                 ip6_del_rt(rt);
1695                 return;
1696         }
1697
1698 out:
1699         dst_release(&rt->dst);
1700 }
1701
1702 /*
1703  *      Handle ICMP "packet too big" messages
1704  *      i.e. Path MTU discovery
1705  */
1706
1707 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1708                              struct net *net, u32 pmtu, int ifindex)
1709 {
1710         struct rt6_info *rt, *nrt;
1711         int allfrag = 0;
1712 again:
1713         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1714         if (!rt)
1715                 return;
1716
1717         if (rt6_check_expired(rt)) {
1718                 ip6_del_rt(rt);
1719                 goto again;
1720         }
1721
1722         if (pmtu >= dst_mtu(&rt->dst))
1723                 goto out;
1724
1725         if (pmtu < IPV6_MIN_MTU) {
1726                 /*
1727                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1728                  * MTU (1280) and a fragment header should always be included
1729                  * after a node receiving Too Big message reporting PMTU is
1730                  * less than the IPv6 Minimum Link MTU.
1731                  */
1732                 pmtu = IPV6_MIN_MTU;
1733                 allfrag = 1;
1734         }
1735
1736         /* New mtu received -> path was valid.
1737            They are sent only in response to data packets,
1738            so that this nexthop apparently is reachable. --ANK
1739          */
1740         dst_confirm(&rt->dst);
1741
1742         /* Host route. If it is static, it would be better
1743            not to override it, but add new one, so that
1744            when cache entry will expire old pmtu
1745            would return automatically.
1746          */
1747         if (rt->rt6i_flags & RTF_CACHE) {
1748                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1749                 if (allfrag) {
1750                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1751                         features |= RTAX_FEATURE_ALLFRAG;
1752                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1753                 }
1754                 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1755                 rt->rt6i_flags |= RTF_MODIFIED;
1756                 goto out;
1757         }
1758
1759         /* Network route.
1760            Two cases are possible:
1761            1. It is connected route. Action: COW
1762            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1763          */
1764         if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1765                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1766         else
1767                 nrt = rt6_alloc_clone(rt, daddr);
1768
1769         if (nrt) {
1770                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1771                 if (allfrag) {
1772                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1773                         features |= RTAX_FEATURE_ALLFRAG;
1774                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1775                 }
1776
1777                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1778                  * happened within 5 mins, the recommended timer is 10 mins.
1779                  * Here this route expiration time is set to ip6_rt_mtu_expires
1780                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1781                  * and detecting PMTU increase will be automatically happened.
1782                  */
1783                 rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1784                 nrt->rt6i_flags |= RTF_DYNAMIC;
1785                 ip6_ins_rt(nrt);
1786         }
1787 out:
1788         dst_release(&rt->dst);
1789 }
1790
1791 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1792                         struct net_device *dev, u32 pmtu)
1793 {
1794         struct net *net = dev_net(dev);
1795
1796         /*
1797          * RFC 1981 states that a node "MUST reduce the size of the packets it
1798          * is sending along the path" that caused the Packet Too Big message.
1799          * Since it's not possible in the general case to determine which
1800          * interface was used to send the original packet, we update the MTU
1801          * on the interface that will be used to send future packets. We also
1802          * update the MTU on the interface that received the Packet Too Big in
1803          * case the original packet was forced out that interface with
1804          * SO_BINDTODEVICE or similar. This is the next best thing to the
1805          * correct behaviour, which would be to update the MTU on all
1806          * interfaces.
1807          */
1808         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1809         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1810 }
1811
1812 /*
1813  *      Misc support functions
1814  */
1815
1816 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1817                                     const struct in6_addr *dest)
1818 {
1819         struct net *net = dev_net(ort->dst.dev);
1820         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1821                                             ort->dst.dev, 0);
1822
1823         if (rt) {
1824                 rt->dst.input = ort->dst.input;
1825                 rt->dst.output = ort->dst.output;
1826                 rt->dst.flags |= DST_HOST;
1827
1828                 rt->rt6i_dst.addr = *dest;
1829                 rt->rt6i_dst.plen = 128;
1830                 dst_copy_metrics(&rt->dst, &ort->dst);
1831                 rt->dst.error = ort->dst.error;
1832                 rt->rt6i_idev = ort->rt6i_idev;
1833                 if (rt->rt6i_idev)
1834                         in6_dev_hold(rt->rt6i_idev);
1835                 rt->dst.lastuse = jiffies;
1836
1837                 rt->rt6i_gateway = ort->rt6i_gateway;
1838                 rt->rt6i_flags = ort->rt6i_flags;
1839                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1840                     (RTF_DEFAULT | RTF_ADDRCONF))
1841                         rt6_set_from(rt, ort);
1842                 else
1843                         rt6_clean_expires(rt);
1844                 rt->rt6i_metric = 0;
1845
1846 #ifdef CONFIG_IPV6_SUBTREES
1847                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1848 #endif
1849                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1850                 rt->rt6i_table = ort->rt6i_table;
1851         }
1852         return rt;
1853 }
1854
1855 #ifdef CONFIG_IPV6_ROUTE_INFO
1856 static struct rt6_info *rt6_get_route_info(struct net *net,
1857                                            const struct in6_addr *prefix, int prefixlen,
1858                                            const struct in6_addr *gwaddr, int ifindex)
1859 {
1860         struct fib6_node *fn;
1861         struct rt6_info *rt = NULL;
1862         struct fib6_table *table;
1863
1864         table = fib6_get_table(net, RT6_TABLE_INFO);
1865         if (!table)
1866                 return NULL;
1867
1868         write_lock_bh(&table->tb6_lock);
1869         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1870         if (!fn)
1871                 goto out;
1872
1873         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1874                 if (rt->dst.dev->ifindex != ifindex)
1875                         continue;
1876                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1877                         continue;
1878                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1879                         continue;
1880                 dst_hold(&rt->dst);
1881                 break;
1882         }
1883 out:
1884         write_unlock_bh(&table->tb6_lock);
1885         return rt;
1886 }
1887
1888 static struct rt6_info *rt6_add_route_info(struct net *net,
1889                                            const struct in6_addr *prefix, int prefixlen,
1890                                            const struct in6_addr *gwaddr, int ifindex,
1891                                            unsigned pref)
1892 {
1893         struct fib6_config cfg = {
1894                 .fc_table       = RT6_TABLE_INFO,
1895                 .fc_metric      = IP6_RT_PRIO_USER,
1896                 .fc_ifindex     = ifindex,
1897                 .fc_dst_len     = prefixlen,
1898                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1899                                   RTF_UP | RTF_PREF(pref),
1900                 .fc_nlinfo.pid = 0,
1901                 .fc_nlinfo.nlh = NULL,
1902                 .fc_nlinfo.nl_net = net,
1903         };
1904
1905         cfg.fc_dst = *prefix;
1906         cfg.fc_gateway = *gwaddr;
1907
1908         /* We should treat it as a default route if prefix length is 0. */
1909         if (!prefixlen)
1910                 cfg.fc_flags |= RTF_DEFAULT;
1911
1912         ip6_route_add(&cfg);
1913
1914         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1915 }
1916 #endif
1917
1918 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1919 {
1920         struct rt6_info *rt;
1921         struct fib6_table *table;
1922
1923         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1924         if (!table)
1925                 return NULL;
1926
1927         write_lock_bh(&table->tb6_lock);
1928         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1929                 if (dev == rt->dst.dev &&
1930                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1931                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1932                         break;
1933         }
1934         if (rt)
1935                 dst_hold(&rt->dst);
1936         write_unlock_bh(&table->tb6_lock);
1937         return rt;
1938 }
1939
1940 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1941                                      struct net_device *dev,
1942                                      unsigned int pref)
1943 {
1944         struct fib6_config cfg = {
1945                 .fc_table       = RT6_TABLE_DFLT,
1946                 .fc_metric      = IP6_RT_PRIO_USER,
1947                 .fc_ifindex     = dev->ifindex,
1948                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1949                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1950                 .fc_nlinfo.pid = 0,
1951                 .fc_nlinfo.nlh = NULL,
1952                 .fc_nlinfo.nl_net = dev_net(dev),
1953         };
1954
1955         cfg.fc_gateway = *gwaddr;
1956
1957         ip6_route_add(&cfg);
1958
1959         return rt6_get_dflt_router(gwaddr, dev);
1960 }
1961
1962 void rt6_purge_dflt_routers(struct net *net)
1963 {
1964         struct rt6_info *rt;
1965         struct fib6_table *table;
1966
1967         /* NOTE: Keep consistent with rt6_get_dflt_router */
1968         table = fib6_get_table(net, RT6_TABLE_DFLT);
1969         if (!table)
1970                 return;
1971
1972 restart:
1973         read_lock_bh(&table->tb6_lock);
1974         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1975                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1976                         dst_hold(&rt->dst);
1977                         read_unlock_bh(&table->tb6_lock);
1978                         ip6_del_rt(rt);
1979                         goto restart;
1980                 }
1981         }
1982         read_unlock_bh(&table->tb6_lock);
1983 }
1984
1985 static void rtmsg_to_fib6_config(struct net *net,
1986                                  struct in6_rtmsg *rtmsg,
1987                                  struct fib6_config *cfg)
1988 {
1989         memset(cfg, 0, sizeof(*cfg));
1990
1991         cfg->fc_table = RT6_TABLE_MAIN;
1992         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1993         cfg->fc_metric = rtmsg->rtmsg_metric;
1994         cfg->fc_expires = rtmsg->rtmsg_info;
1995         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1996         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1997         cfg->fc_flags = rtmsg->rtmsg_flags;
1998
1999         cfg->fc_nlinfo.nl_net = net;
2000
2001         cfg->fc_dst = rtmsg->rtmsg_dst;
2002         cfg->fc_src = rtmsg->rtmsg_src;
2003         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2004 }
2005
2006 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2007 {
2008         struct fib6_config cfg;
2009         struct in6_rtmsg rtmsg;
2010         int err;
2011
2012         switch(cmd) {
2013         case SIOCADDRT:         /* Add a route */
2014         case SIOCDELRT:         /* Delete a route */
2015                 if (!capable(CAP_NET_ADMIN))
2016                         return -EPERM;
2017                 err = copy_from_user(&rtmsg, arg,
2018                                      sizeof(struct in6_rtmsg));
2019                 if (err)
2020                         return -EFAULT;
2021
2022                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2023
2024                 rtnl_lock();
2025                 switch (cmd) {
2026                 case SIOCADDRT:
2027                         err = ip6_route_add(&cfg);
2028                         break;
2029                 case SIOCDELRT:
2030                         err = ip6_route_del(&cfg);
2031                         break;
2032                 default:
2033                         err = -EINVAL;
2034                 }
2035                 rtnl_unlock();
2036
2037                 return err;
2038         }
2039
2040         return -EINVAL;
2041 }
2042
2043 /*
2044  *      Drop the packet on the floor
2045  */
2046
2047 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2048 {
2049         int type;
2050         struct dst_entry *dst = skb_dst(skb);
2051         switch (ipstats_mib_noroutes) {
2052         case IPSTATS_MIB_INNOROUTES:
2053                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2054                 if (type == IPV6_ADDR_ANY) {
2055                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2056                                       IPSTATS_MIB_INADDRERRORS);
2057                         break;
2058                 }
2059                 /* FALLTHROUGH */
2060         case IPSTATS_MIB_OUTNOROUTES:
2061                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2062                               ipstats_mib_noroutes);
2063                 break;
2064         }
2065         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2066         kfree_skb(skb);
2067         return 0;
2068 }
2069
2070 static int ip6_pkt_discard(struct sk_buff *skb)
2071 {
2072         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2073 }
2074
2075 static int ip6_pkt_discard_out(struct sk_buff *skb)
2076 {
2077         skb->dev = skb_dst(skb)->dev;
2078         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2079 }
2080
2081 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2082
2083 static int ip6_pkt_prohibit(struct sk_buff *skb)
2084 {
2085         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2086 }
2087
2088 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2089 {
2090         skb->dev = skb_dst(skb)->dev;
2091         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2092 }
2093
2094 #endif
2095
2096 /*
2097  *      Allocate a dst for local (unicast / anycast) address.
2098  */
2099
2100 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2101                                     const struct in6_addr *addr,
2102                                     bool anycast)
2103 {
2104         struct net *net = dev_net(idev->dev);
2105         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2106                                             net->loopback_dev, 0);
2107         int err;
2108
2109         if (!rt) {
2110                 if (net_ratelimit())
2111                         pr_warning("IPv6:  Maximum number of routes reached,"
2112                                    " consider increasing route/max_size.\n");
2113                 return ERR_PTR(-ENOMEM);
2114         }
2115
2116         in6_dev_hold(idev);
2117
2118         rt->dst.flags |= DST_HOST;
2119         rt->dst.input = ip6_input;
2120         rt->dst.output = ip6_output;
2121         rt->rt6i_idev = idev;
2122         rt->dst.obsolete = -1;
2123
2124         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2125         if (anycast)
2126                 rt->rt6i_flags |= RTF_ANYCAST;
2127         else
2128                 rt->rt6i_flags |= RTF_LOCAL;
2129         err = rt6_bind_neighbour(rt, rt->dst.dev);
2130         if (err) {
2131                 dst_free(&rt->dst);
2132                 return ERR_PTR(err);
2133         }
2134
2135         rt->rt6i_dst.addr = *addr;
2136         rt->rt6i_dst.plen = 128;
2137         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2138
2139         atomic_set(&rt->dst.__refcnt, 1);
2140
2141         return rt;
2142 }
2143
2144 int ip6_route_get_saddr(struct net *net,
2145                         struct rt6_info *rt,
2146                         const struct in6_addr *daddr,
2147                         unsigned int prefs,
2148                         struct in6_addr *saddr)
2149 {
2150         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2151         int err = 0;
2152         if (rt->rt6i_prefsrc.plen)
2153                 *saddr = rt->rt6i_prefsrc.addr;
2154         else
2155                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2156                                          daddr, prefs, saddr);
2157         return err;
2158 }
2159
2160 /* remove deleted ip from prefsrc entries */
2161 struct arg_dev_net_ip {
2162         struct net_device *dev;
2163         struct net *net;
2164         struct in6_addr *addr;
2165 };
2166
2167 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2168 {
2169         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2170         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2171         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2172
2173         if (((void *)rt->dst.dev == dev || !dev) &&
2174             rt != net->ipv6.ip6_null_entry &&
2175             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2176                 /* remove prefsrc entry */
2177                 rt->rt6i_prefsrc.plen = 0;
2178         }
2179         return 0;
2180 }
2181
2182 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2183 {
2184         struct net *net = dev_net(ifp->idev->dev);
2185         struct arg_dev_net_ip adni = {
2186                 .dev = ifp->idev->dev,
2187                 .net = net,
2188                 .addr = &ifp->addr,
2189         };
2190         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2191 }
2192
2193 struct arg_dev_net {
2194         struct net_device *dev;
2195         struct net *net;
2196 };
2197
2198 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2199 {
2200         const struct arg_dev_net *adn = arg;
2201         const struct net_device *dev = adn->dev;
2202
2203         if ((rt->dst.dev == dev || !dev) &&
2204             rt != adn->net->ipv6.ip6_null_entry)
2205                 return -1;
2206
2207         return 0;
2208 }
2209
2210 void rt6_ifdown(struct net *net, struct net_device *dev)
2211 {
2212         struct arg_dev_net adn = {
2213                 .dev = dev,
2214                 .net = net,
2215         };
2216
2217         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2218         icmp6_clean_all(fib6_ifdown, &adn);
2219 }
2220
2221 struct rt6_mtu_change_arg
2222 {
2223         struct net_device *dev;
2224         unsigned mtu;
2225 };
2226
2227 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2228 {
2229         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2230         struct inet6_dev *idev;
2231
2232         /* In IPv6 pmtu discovery is not optional,
2233            so that RTAX_MTU lock cannot disable it.
2234            We still use this lock to block changes
2235            caused by addrconf/ndisc.
2236         */
2237
2238         idev = __in6_dev_get(arg->dev);
2239         if (!idev)
2240                 return 0;
2241
2242         /* For administrative MTU increase, there is no way to discover
2243            IPv6 PMTU increase, so PMTU increase should be updated here.
2244            Since RFC 1981 doesn't include administrative MTU increase
2245            update PMTU increase is a MUST. (i.e. jumbo frame)
2246          */
2247         /*
2248            If new MTU is less than route PMTU, this new MTU will be the
2249            lowest MTU in the path, update the route PMTU to reflect PMTU
2250            decreases; if new MTU is greater than route PMTU, and the
2251            old MTU is the lowest MTU in the path, update the route PMTU
2252            to reflect the increase. In this case if the other nodes' MTU
2253            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2254            PMTU discouvery.
2255          */
2256         if (rt->dst.dev == arg->dev &&
2257             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2258             (dst_mtu(&rt->dst) >= arg->mtu ||
2259              (dst_mtu(&rt->dst) < arg->mtu &&
2260               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2261                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2262         }
2263         return 0;
2264 }
2265
2266 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2267 {
2268         struct rt6_mtu_change_arg arg = {
2269                 .dev = dev,
2270                 .mtu = mtu,
2271         };
2272
2273         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2274 }
2275
2276 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2277         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2278         [RTA_OIF]               = { .type = NLA_U32 },
2279         [RTA_IIF]               = { .type = NLA_U32 },
2280         [RTA_PRIORITY]          = { .type = NLA_U32 },
2281         [RTA_METRICS]           = { .type = NLA_NESTED },
2282 };
2283
2284 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2285                               struct fib6_config *cfg)
2286 {
2287         struct rtmsg *rtm;
2288         struct nlattr *tb[RTA_MAX+1];
2289         int err;
2290
2291         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2292         if (err < 0)
2293                 goto errout;
2294
2295         err = -EINVAL;
2296         rtm = nlmsg_data(nlh);
2297         memset(cfg, 0, sizeof(*cfg));
2298
2299         cfg->fc_table = rtm->rtm_table;
2300         cfg->fc_dst_len = rtm->rtm_dst_len;
2301         cfg->fc_src_len = rtm->rtm_src_len;
2302         cfg->fc_flags = RTF_UP;
2303         cfg->fc_protocol = rtm->rtm_protocol;
2304
2305         if (rtm->rtm_type == RTN_UNREACHABLE)
2306                 cfg->fc_flags |= RTF_REJECT;
2307
2308         if (rtm->rtm_type == RTN_LOCAL)
2309                 cfg->fc_flags |= RTF_LOCAL;
2310
2311         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2312         cfg->fc_nlinfo.nlh = nlh;
2313         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2314
2315         if (tb[RTA_GATEWAY]) {
2316                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2317                 cfg->fc_flags |= RTF_GATEWAY;
2318         }
2319
2320         if (tb[RTA_DST]) {
2321                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2322
2323                 if (nla_len(tb[RTA_DST]) < plen)
2324                         goto errout;
2325
2326                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2327         }
2328
2329         if (tb[RTA_SRC]) {
2330                 int plen = (rtm->rtm_src_len + 7) >> 3;
2331
2332                 if (nla_len(tb[RTA_SRC]) < plen)
2333                         goto errout;
2334
2335                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2336         }
2337
2338         if (tb[RTA_PREFSRC])
2339                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2340
2341         if (tb[RTA_OIF])
2342                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2343
2344         if (tb[RTA_PRIORITY])
2345                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2346
2347         if (tb[RTA_METRICS]) {
2348                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2349                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2350         }
2351
2352         if (tb[RTA_TABLE])
2353                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2354
2355         err = 0;
2356 errout:
2357         return err;
2358 }
2359
2360 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2361 {
2362         struct fib6_config cfg;
2363         int err;
2364
2365         err = rtm_to_fib6_config(skb, nlh, &cfg);
2366         if (err < 0)
2367                 return err;
2368
2369         return ip6_route_del(&cfg);
2370 }
2371
2372 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2373 {
2374         struct fib6_config cfg;
2375         int err;
2376
2377         err = rtm_to_fib6_config(skb, nlh, &cfg);
2378         if (err < 0)
2379                 return err;
2380
2381         return ip6_route_add(&cfg);
2382 }
2383
2384 static inline size_t rt6_nlmsg_size(void)
2385 {
2386         return NLMSG_ALIGN(sizeof(struct rtmsg))
2387                + nla_total_size(16) /* RTA_SRC */
2388                + nla_total_size(16) /* RTA_DST */
2389                + nla_total_size(16) /* RTA_GATEWAY */
2390                + nla_total_size(16) /* RTA_PREFSRC */
2391                + nla_total_size(4) /* RTA_TABLE */
2392                + nla_total_size(4) /* RTA_IIF */
2393                + nla_total_size(4) /* RTA_OIF */
2394                + nla_total_size(4) /* RTA_PRIORITY */
2395                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2396                + nla_total_size(sizeof(struct rta_cacheinfo));
2397 }
2398
2399 static int rt6_fill_node(struct net *net,
2400                          struct sk_buff *skb, struct rt6_info *rt,
2401                          struct in6_addr *dst, struct in6_addr *src,
2402                          int iif, int type, u32 pid, u32 seq,
2403                          int prefix, int nowait, unsigned int flags)
2404 {
2405         const struct inet_peer *peer;
2406         struct rtmsg *rtm;
2407         struct nlmsghdr *nlh;
2408         long expires;
2409         u32 table;
2410         struct neighbour *n;
2411         u32 ts, tsage;
2412
2413         if (prefix) {   /* user wants prefix routes only */
2414                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2415                         /* success since this is not a prefix route */
2416                         return 1;
2417                 }
2418         }
2419
2420         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2421         if (!nlh)
2422                 return -EMSGSIZE;
2423
2424         rtm = nlmsg_data(nlh);
2425         rtm->rtm_family = AF_INET6;
2426         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2427         rtm->rtm_src_len = rt->rt6i_src.plen;
2428         rtm->rtm_tos = 0;
2429         if (rt->rt6i_table)
2430                 table = rt->rt6i_table->tb6_id;
2431         else
2432                 table = RT6_TABLE_UNSPEC;
2433         rtm->rtm_table = table;
2434         NLA_PUT_U32(skb, RTA_TABLE, table);
2435         if (rt->rt6i_flags & RTF_REJECT)
2436                 rtm->rtm_type = RTN_UNREACHABLE;
2437         else if (rt->rt6i_flags & RTF_LOCAL)
2438                 rtm->rtm_type = RTN_LOCAL;
2439         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2440                 rtm->rtm_type = RTN_LOCAL;
2441         else
2442                 rtm->rtm_type = RTN_UNICAST;
2443         rtm->rtm_flags = 0;
2444         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2445         rtm->rtm_protocol = rt->rt6i_protocol;
2446         if (rt->rt6i_flags & RTF_DYNAMIC)
2447                 rtm->rtm_protocol = RTPROT_REDIRECT;
2448         else if (rt->rt6i_flags & RTF_ADDRCONF)
2449                 rtm->rtm_protocol = RTPROT_KERNEL;
2450         else if (rt->rt6i_flags & RTF_DEFAULT)
2451                 rtm->rtm_protocol = RTPROT_RA;
2452
2453         if (rt->rt6i_flags & RTF_CACHE)
2454                 rtm->rtm_flags |= RTM_F_CLONED;
2455
2456         if (dst) {
2457                 NLA_PUT(skb, RTA_DST, 16, dst);
2458                 rtm->rtm_dst_len = 128;
2459         } else if (rtm->rtm_dst_len)
2460                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2461 #ifdef CONFIG_IPV6_SUBTREES
2462         if (src) {
2463                 NLA_PUT(skb, RTA_SRC, 16, src);
2464                 rtm->rtm_src_len = 128;
2465         } else if (rtm->rtm_src_len)
2466                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2467 #endif
2468         if (iif) {
2469 #ifdef CONFIG_IPV6_MROUTE
2470                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2471                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2472                         if (err <= 0) {
2473                                 if (!nowait) {
2474                                         if (err == 0)
2475                                                 return 0;
2476                                         goto nla_put_failure;
2477                                 } else {
2478                                         if (err == -EMSGSIZE)
2479                                                 goto nla_put_failure;
2480                                 }
2481                         }
2482                 } else
2483 #endif
2484                         NLA_PUT_U32(skb, RTA_IIF, iif);
2485         } else if (dst) {
2486                 struct in6_addr saddr_buf;
2487                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2488                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2489         }
2490
2491         if (rt->rt6i_prefsrc.plen) {
2492                 struct in6_addr saddr_buf;
2493                 saddr_buf = rt->rt6i_prefsrc.addr;
2494                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2495         }
2496
2497         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2498                 goto nla_put_failure;
2499
2500         rcu_read_lock();
2501         n = dst_get_neighbour_noref(&rt->dst);
2502         if (n) {
2503                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2504                         rcu_read_unlock();
2505                         goto nla_put_failure;
2506                 }
2507         }
2508         rcu_read_unlock();
2509
2510         if (rt->dst.dev)
2511                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2512
2513         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2514
2515         if (!(rt->rt6i_flags & RTF_EXPIRES))
2516                 expires = 0;
2517         else if (rt->dst.expires - jiffies < INT_MAX)
2518                 expires = rt->dst.expires - jiffies;
2519         else
2520                 expires = INT_MAX;
2521
2522         peer = rt->rt6i_peer;
2523         ts = tsage = 0;
2524         if (peer && peer->tcp_ts_stamp) {
2525                 ts = peer->tcp_ts;
2526                 tsage = get_seconds() - peer->tcp_ts_stamp;
2527         }
2528
2529         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2530                                expires, rt->dst.error) < 0)
2531                 goto nla_put_failure;
2532
2533         return nlmsg_end(skb, nlh);
2534
2535 nla_put_failure:
2536         nlmsg_cancel(skb, nlh);
2537         return -EMSGSIZE;
2538 }
2539
2540 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2541 {
2542         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2543         int prefix;
2544
2545         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2546                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2547                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2548         } else
2549                 prefix = 0;
2550
2551         return rt6_fill_node(arg->net,
2552                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2553                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2554                      prefix, 0, NLM_F_MULTI);
2555 }
2556
2557 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2558 {
2559         struct net *net = sock_net(in_skb->sk);
2560         struct nlattr *tb[RTA_MAX+1];
2561         struct rt6_info *rt;
2562         struct sk_buff *skb;
2563         struct rtmsg *rtm;
2564         struct flowi6 fl6;
2565         int err, iif = 0, oif = 0;
2566
2567         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2568         if (err < 0)
2569                 goto errout;
2570
2571         err = -EINVAL;
2572         memset(&fl6, 0, sizeof(fl6));
2573
2574         if (tb[RTA_SRC]) {
2575                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2576                         goto errout;
2577
2578                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2579         }
2580
2581         if (tb[RTA_DST]) {
2582                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2583                         goto errout;
2584
2585                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2586         }
2587
2588         if (tb[RTA_IIF])
2589                 iif = nla_get_u32(tb[RTA_IIF]);
2590
2591         if (tb[RTA_OIF])
2592                 oif = nla_get_u32(tb[RTA_OIF]);
2593
2594         if (iif) {
2595                 struct net_device *dev;
2596                 int flags = 0;
2597
2598                 dev = __dev_get_by_index(net, iif);
2599                 if (!dev) {
2600                         err = -ENODEV;
2601                         goto errout;
2602                 }
2603
2604                 fl6.flowi6_iif = iif;
2605
2606                 if (!ipv6_addr_any(&fl6.saddr))
2607                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2608
2609                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2610                                                                flags);
2611         } else {
2612                 fl6.flowi6_oif = oif;
2613
2614                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2615         }
2616
2617         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2618         if (!skb) {
2619                 err = -ENOBUFS;
2620                 goto errout;
2621         }
2622
2623         /* Reserve room for dummy headers, this skb can pass
2624            through good chunk of routing engine.
2625          */
2626         skb_reset_mac_header(skb);
2627         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2628
2629         skb_dst_set(skb, &rt->dst);
2630
2631         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2632                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2633                             nlh->nlmsg_seq, 0, 0, 0);
2634         if (err < 0) {
2635                 kfree_skb(skb);
2636                 goto errout;
2637         }
2638
2639         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2640 errout:
2641         return err;
2642 }
2643
2644 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2645 {
2646         struct sk_buff *skb;
2647         struct net *net = info->nl_net;
2648         u32 seq;
2649         int err;
2650
2651         err = -ENOBUFS;
2652         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2653
2654         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2655         if (!skb)
2656                 goto errout;
2657
2658         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2659                                 event, info->pid, seq, 0, 0, 0);
2660         if (err < 0) {
2661                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2662                 WARN_ON(err == -EMSGSIZE);
2663                 kfree_skb(skb);
2664                 goto errout;
2665         }
2666         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2667                     info->nlh, gfp_any());
2668         return;
2669 errout:
2670         if (err < 0)
2671                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2672 }
2673
2674 static int ip6_route_dev_notify(struct notifier_block *this,
2675                                 unsigned long event, void *data)
2676 {
2677         struct net_device *dev = (struct net_device *)data;
2678         struct net *net = dev_net(dev);
2679
2680         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2681                 net->ipv6.ip6_null_entry->dst.dev = dev;
2682                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2683 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2684                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2685                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2686                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2687                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2688 #endif
2689         }
2690
2691         return NOTIFY_OK;
2692 }
2693
2694 /*
2695  *      /proc
2696  */
2697
2698 #ifdef CONFIG_PROC_FS
2699
/*
 * Buffer/position bookkeeping for /proc output.
 * NOTE(review): no user of this struct is visible in this part of the
 * file (the handlers below work on a seq_file) -- confirm it is still needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2708
2709 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2710 {
2711         struct seq_file *m = p_arg;
2712         struct neighbour *n;
2713
2714         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2715
2716 #ifdef CONFIG_IPV6_SUBTREES
2717         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2718 #else
2719         seq_puts(m, "00000000000000000000000000000000 00 ");
2720 #endif
2721         rcu_read_lock();
2722         n = dst_get_neighbour_noref(&rt->dst);
2723         if (n) {
2724                 seq_printf(m, "%pi6", n->primary_key);
2725         } else {
2726                 seq_puts(m, "00000000000000000000000000000000");
2727         }
2728         rcu_read_unlock();
2729         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2730                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2731                    rt->dst.__use, rt->rt6i_flags,
2732                    rt->dst.dev ? rt->dst.dev->name : "");
2733         return 0;
2734 }
2735
2736 static int ipv6_route_show(struct seq_file *m, void *v)
2737 {
2738         struct net *net = (struct net *)m->private;
2739         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2740         return 0;
2741 }
2742
2743 static int ipv6_route_open(struct inode *inode, struct file *file)
2744 {
2745         return single_open_net(inode, file, ipv6_route_show);
2746 }
2747
2748 static const struct file_operations ipv6_route_proc_fops = {
2749         .owner          = THIS_MODULE,
2750         .open           = ipv6_route_open,
2751         .read           = seq_read,
2752         .llseek         = seq_lseek,
2753         .release        = single_release_net,
2754 };
2755
2756 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2757 {
2758         struct net *net = (struct net *)seq->private;
2759         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2760                    net->ipv6.rt6_stats->fib_nodes,
2761                    net->ipv6.rt6_stats->fib_route_nodes,
2762                    net->ipv6.rt6_stats->fib_rt_alloc,
2763                    net->ipv6.rt6_stats->fib_rt_entries,
2764                    net->ipv6.rt6_stats->fib_rt_cache,
2765                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2766                    net->ipv6.rt6_stats->fib_discarded_routes);
2767
2768         return 0;
2769 }
2770
2771 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2772 {
2773         return single_open_net(inode, file, rt6_stats_seq_show);
2774 }
2775
2776 static const struct file_operations rt6_stats_seq_fops = {
2777         .owner   = THIS_MODULE,
2778         .open    = rt6_stats_seq_open,
2779         .read    = seq_read,
2780         .llseek  = seq_lseek,
2781         .release = single_release_net,
2782 };
2783 #endif  /* CONFIG_PROC_FS */
2784
2785 #ifdef CONFIG_SYSCTL
2786
2787 static
2788 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2789                               void __user *buffer, size_t *lenp, loff_t *ppos)
2790 {
2791         struct net *net;
2792         int delay;
2793         if (!write)
2794                 return -EINVAL;
2795
2796         net = (struct net *)ctl->extra1;
2797         delay = net->ipv6.sysctl.flush_delay;
2798         proc_dointvec(ctl, write, buffer, lenp, ppos);
2799         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2800         return 0;
2801 }
2802
2803 ctl_table ipv6_route_table_template[] = {
2804         {
2805                 .procname       =       "flush",
2806                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2807                 .maxlen         =       sizeof(int),
2808                 .mode           =       0200,
2809                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2810         },
2811         {
2812                 .procname       =       "gc_thresh",
2813                 .data           =       &ip6_dst_ops_template.gc_thresh,
2814                 .maxlen         =       sizeof(int),
2815                 .mode           =       0644,
2816                 .proc_handler   =       proc_dointvec,
2817         },
2818         {
2819                 .procname       =       "max_size",
2820                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2821                 .maxlen         =       sizeof(int),
2822                 .mode           =       0644,
2823                 .proc_handler   =       proc_dointvec,
2824         },
2825         {
2826                 .procname       =       "gc_min_interval",
2827                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2828                 .maxlen         =       sizeof(int),
2829                 .mode           =       0644,
2830                 .proc_handler   =       proc_dointvec_jiffies,
2831         },
2832         {
2833                 .procname       =       "gc_timeout",
2834                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2835                 .maxlen         =       sizeof(int),
2836                 .mode           =       0644,
2837                 .proc_handler   =       proc_dointvec_jiffies,
2838         },
2839         {
2840                 .procname       =       "gc_interval",
2841                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2842                 .maxlen         =       sizeof(int),
2843                 .mode           =       0644,
2844                 .proc_handler   =       proc_dointvec_jiffies,
2845         },
2846         {
2847                 .procname       =       "gc_elasticity",
2848                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2849                 .maxlen         =       sizeof(int),
2850                 .mode           =       0644,
2851                 .proc_handler   =       proc_dointvec,
2852         },
2853         {
2854                 .procname       =       "mtu_expires",
2855                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2856                 .maxlen         =       sizeof(int),
2857                 .mode           =       0644,
2858                 .proc_handler   =       proc_dointvec_jiffies,
2859         },
2860         {
2861                 .procname       =       "min_adv_mss",
2862                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2863                 .maxlen         =       sizeof(int),
2864                 .mode           =       0644,
2865                 .proc_handler   =       proc_dointvec,
2866         },
2867         {
2868                 .procname       =       "gc_min_interval_ms",
2869                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2870                 .maxlen         =       sizeof(int),
2871                 .mode           =       0644,
2872                 .proc_handler   =       proc_dointvec_ms_jiffies,
2873         },
2874         { }
2875 };
2876
2877 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2878 {
2879         struct ctl_table *table;
2880
2881         table = kmemdup(ipv6_route_table_template,
2882                         sizeof(ipv6_route_table_template),
2883                         GFP_KERNEL);
2884
2885         if (table) {
2886                 table[0].data = &net->ipv6.sysctl.flush_delay;
2887                 table[0].extra1 = net;
2888                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2889                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2890                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2891                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2892                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2893                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2894                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2895                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2896                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2897         }
2898
2899         return table;
2900 }
2901 #endif
2902
2903 static int __net_init ip6_route_net_init(struct net *net)
2904 {
2905         int ret = -ENOMEM;
2906
2907         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2908                sizeof(net->ipv6.ip6_dst_ops));
2909
2910         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2911                 goto out_ip6_dst_ops;
2912
2913         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2914                                            sizeof(*net->ipv6.ip6_null_entry),
2915                                            GFP_KERNEL);
2916         if (!net->ipv6.ip6_null_entry)
2917                 goto out_ip6_dst_entries;
2918         net->ipv6.ip6_null_entry->dst.path =
2919                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2920         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2921         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2922                          ip6_template_metrics, true);
2923
2924 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2925         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2926                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2927                                                GFP_KERNEL);
2928         if (!net->ipv6.ip6_prohibit_entry)
2929                 goto out_ip6_null_entry;
2930         net->ipv6.ip6_prohibit_entry->dst.path =
2931                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2932         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2933         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2934                          ip6_template_metrics, true);
2935
2936         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2937                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2938                                                GFP_KERNEL);
2939         if (!net->ipv6.ip6_blk_hole_entry)
2940                 goto out_ip6_prohibit_entry;
2941         net->ipv6.ip6_blk_hole_entry->dst.path =
2942                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2943         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2944         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2945                          ip6_template_metrics, true);
2946 #endif
2947
2948         net->ipv6.sysctl.flush_delay = 0;
2949         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2950         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2951         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2952         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2953         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2954         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2955         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2956
2957         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2958
2959         ret = 0;
2960 out:
2961         return ret;
2962
2963 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2964 out_ip6_prohibit_entry:
2965         kfree(net->ipv6.ip6_prohibit_entry);
2966 out_ip6_null_entry:
2967         kfree(net->ipv6.ip6_null_entry);
2968 #endif
2969 out_ip6_dst_entries:
2970         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2971 out_ip6_dst_ops:
2972         goto out;
2973 }
2974
2975 static void __net_exit ip6_route_net_exit(struct net *net)
2976 {
2977         kfree(net->ipv6.ip6_null_entry);
2978 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2979         kfree(net->ipv6.ip6_prohibit_entry);
2980         kfree(net->ipv6.ip6_blk_hole_entry);
2981 #endif
2982         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2983 }
2984
2985 static int __net_init ip6_route_net_init_late(struct net *net)
2986 {
2987 #ifdef CONFIG_PROC_FS
2988         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2989         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2990 #endif
2991         return 0;
2992 }
2993
2994 static void __net_exit ip6_route_net_exit_late(struct net *net)
2995 {
2996 #ifdef CONFIG_PROC_FS
2997         proc_net_remove(net, "ipv6_route");
2998         proc_net_remove(net, "rt6_stats");
2999 #endif
3000 }
3001
3002 static struct pernet_operations ip6_route_net_ops = {
3003         .init = ip6_route_net_init,
3004         .exit = ip6_route_net_exit,
3005 };
3006
3007 static struct pernet_operations ip6_route_net_late_ops = {
3008         .init = ip6_route_net_init_late,
3009         .exit = ip6_route_net_exit_late,
3010 };
3011
3012 static struct notifier_block ip6_route_dev_notifier = {
3013         .notifier_call = ip6_route_dev_notify,
3014         .priority = 0,
3015 };
3016
3017 int __init ip6_route_init(void)
3018 {
3019         int ret;
3020
3021         ret = -ENOMEM;
3022         ip6_dst_ops_template.kmem_cachep =
3023                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3024                                   SLAB_HWCACHE_ALIGN, NULL);
3025         if (!ip6_dst_ops_template.kmem_cachep)
3026                 goto out;
3027
3028         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3029         if (ret)
3030                 goto out_kmem_cache;
3031
3032         ret = register_pernet_subsys(&ip6_route_net_ops);
3033         if (ret)
3034                 goto out_dst_entries;
3035
3036         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3037
3038         /* Registering of the loopback is done before this portion of code,
3039          * the loopback reference in rt6_info will not be taken, do it
3040          * manually for init_net */
3041         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3042         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3043   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3044         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3045         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3046         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3047         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3048   #endif
3049         ret = fib6_init();
3050         if (ret)
3051                 goto out_register_subsys;
3052
3053         ret = xfrm6_init();
3054         if (ret)
3055                 goto out_fib6_init;
3056
3057         ret = fib6_rules_init();
3058         if (ret)
3059                 goto xfrm6_init;
3060
3061         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3062         if (ret)
3063                 goto fib6_rules_init;
3064
3065         ret = -ENOBUFS;
3066         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3067             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3068             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3069                 goto out_register_late_subsys;
3070
3071         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3072         if (ret)
3073                 goto out_register_late_subsys;
3074
3075 out:
3076         return ret;
3077
3078 out_register_late_subsys:
3079         unregister_pernet_subsys(&ip6_route_net_late_ops);
3080 fib6_rules_init:
3081         fib6_rules_cleanup();
3082 xfrm6_init:
3083         xfrm6_fini();
3084 out_fib6_init:
3085         fib6_gc_cleanup();
3086 out_register_subsys:
3087         unregister_pernet_subsys(&ip6_route_net_ops);
3088 out_dst_entries:
3089         dst_entries_destroy(&ip6_dst_blackhole_ops);
3090 out_kmem_cache:
3091         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3092         goto out;
3093 }
3094
3095 void ip6_route_cleanup(void)
3096 {
3097         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3098         unregister_pernet_subsys(&ip6_route_net_late_ops);
3099         fib6_rules_cleanup();
3100         xfrm6_fini();
3101         fib6_gc_cleanup();
3102         unregister_pernet_subsys(&ip6_route_net_ops);
3103         dst_entries_destroy(&ip6_dst_blackhole_ops);
3104         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3105 }