]> git.karo-electronics.de Git - mv-sheeva.git/blob - net/ipv6/route.c
sysctl: ipv6: use correct net in ipv6_sysctl_rtcache_flush
[mv-sheeva.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops_template = {
101         .family                 =       AF_INET6,
102         .protocol               =       cpu_to_be16(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .default_advmss         =       ip6_default_advmss,
107         .default_mtu            =       ip6_default_mtu,
108         .destroy                =       ip6_dst_destroy,
109         .ifdown                 =       ip6_dst_ifdown,
110         .negative_advice        =       ip6_negative_advice,
111         .link_failure           =       ip6_link_failure,
112         .update_pmtu            =       ip6_rt_update_pmtu,
113         .local_out              =       __ip6_local_out,
114 };
115
/* default_mtu callback for blackhole dsts: always reports 0. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	(void)dst;	/* the entry itself is not consulted */

	return 0U;
}
120
121 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
122 {
123 }
124
125 static struct dst_ops ip6_dst_blackhole_ops = {
126         .family                 =       AF_INET6,
127         .protocol               =       cpu_to_be16(ETH_P_IPV6),
128         .destroy                =       ip6_dst_destroy,
129         .check                  =       ip6_dst_check,
130         .default_mtu            =       ip6_blackhole_default_mtu,
131         .default_advmss         =       ip6_default_advmss,
132         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
133 };
134
135 static struct rt6_info ip6_null_entry_template = {
136         .dst = {
137                 .__refcnt       = ATOMIC_INIT(1),
138                 .__use          = 1,
139                 .obsolete       = -1,
140                 .error          = -ENETUNREACH,
141                 .input          = ip6_pkt_discard,
142                 .output         = ip6_pkt_discard_out,
143         },
144         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
145         .rt6i_protocol  = RTPROT_KERNEL,
146         .rt6i_metric    = ~(u32) 0,
147         .rt6i_ref       = ATOMIC_INIT(1),
148 };
149
150 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
151
152 static int ip6_pkt_prohibit(struct sk_buff *skb);
153 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
154
155 static struct rt6_info ip6_prohibit_entry_template = {
156         .dst = {
157                 .__refcnt       = ATOMIC_INIT(1),
158                 .__use          = 1,
159                 .obsolete       = -1,
160                 .error          = -EACCES,
161                 .input          = ip6_pkt_prohibit,
162                 .output         = ip6_pkt_prohibit_out,
163         },
164         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
165         .rt6i_protocol  = RTPROT_KERNEL,
166         .rt6i_metric    = ~(u32) 0,
167         .rt6i_ref       = ATOMIC_INIT(1),
168 };
169
170 static struct rt6_info ip6_blk_hole_entry_template = {
171         .dst = {
172                 .__refcnt       = ATOMIC_INIT(1),
173                 .__use          = 1,
174                 .obsolete       = -1,
175                 .error          = -EINVAL,
176                 .input          = dst_discard,
177                 .output         = dst_discard,
178         },
179         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
180         .rt6i_protocol  = RTPROT_KERNEL,
181         .rt6i_metric    = ~(u32) 0,
182         .rt6i_ref       = ATOMIC_INIT(1),
183 };
184
185 #endif
186
/*
 * Allocate a dst through @ops and hand it back typed as an rt6_info.
 * Returns whatever dst_alloc() returned, merely cast.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	void *entry = dst_alloc(ops);

	return (struct rt6_info *)entry;
}
192
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195         struct rt6_info *rt = (struct rt6_info *)dst;
196         struct inet6_dev *idev = rt->rt6i_idev;
197         struct inet_peer *peer = rt->rt6i_peer;
198
199         if (idev != NULL) {
200                 rt->rt6i_idev = NULL;
201                 in6_dev_put(idev);
202         }
203         if (peer) {
204                 rt->rt6i_peer = NULL;
205                 inet_putpeer(peer);
206         }
207 }
208
209 void rt6_bind_peer(struct rt6_info *rt, int create)
210 {
211         struct inet_peer *peer;
212
213         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
214         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
215                 inet_putpeer(peer);
216 }
217
218 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
219                            int how)
220 {
221         struct rt6_info *rt = (struct rt6_info *)dst;
222         struct inet6_dev *idev = rt->rt6i_idev;
223         struct net_device *loopback_dev =
224                 dev_net(dev)->loopback_dev;
225
226         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
227                 struct inet6_dev *loopback_idev =
228                         in6_dev_get(loopback_dev);
229                 if (loopback_idev != NULL) {
230                         rt->rt6i_idev = loopback_idev;
231                         in6_dev_put(idev);
232                 }
233         }
234 }
235
236 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
237 {
238         return (rt->rt6i_flags & RTF_EXPIRES) &&
239                 time_after(jiffies, rt->rt6i_expires);
240 }
241
242 static inline int rt6_need_strict(struct in6_addr *daddr)
243 {
244         return ipv6_addr_type(daddr) &
245                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
246 }
247
248 /*
249  *      Route lookup. Any table->tb6_lock is implied.
250  */
251
/*
 * rt6_device_match - pick the entry of a route list that fits the
 * requested outgoing interface or source address.
 *
 * @net:   namespace, used for ipv6_chk_addr() and for the null entry
 * @rt:    head of the list, walked through dst.rt6_next
 * @saddr: source address, only consulted when @oif is 0
 * @oif:   requested interface index, 0 for "none"
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is tested here
 *
 * With neither an @oif nor a specific @saddr, @rt is returned untouched.
 * With an @oif, the first entry whose device has that ifindex wins;
 * entries on a loopback device are remembered in 'local' as a fallback.
 * Without an @oif, the first entry whose device passes ipv6_chk_addr()
 * for @saddr wins.  If nothing matched: the loopback fallback is returned
 * if there is one, else the namespace's null entry when
 * RT6_LOOKUP_F_IFACE is set (both only for non-zero @oif), else @rt.
 */
252 static inline struct rt6_info *rt6_device_match(struct net *net,
253                                                     struct rt6_info *rt,
254                                                     struct in6_addr *saddr,
255                                                     int oif,
256                                                     int flags)
257 {
258         struct rt6_info *local = NULL;
259         struct rt6_info *sprt;
260
/* nothing to match against: keep the head of the list */
261         if (!oif && ipv6_addr_any(saddr))
262                 goto out;
263
264         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
265                 struct net_device *dev = sprt->rt6i_dev;
266
267                 if (oif) {
/* exact interface match ends the search */
268                         if (dev->ifindex == oif)
269                                 return sprt;
270                         if (dev->flags & IFF_LOOPBACK) {
/*
 * Loopback entry whose inet6_dev is missing or not on oif: skipped when
 * an interface match is mandatory, or when the fallback already recorded
 * is itself bound to oif.  (oif is known non-zero in this branch.)
 */
271                                 if (sprt->rt6i_idev == NULL ||
272                                     sprt->rt6i_idev->dev->ifindex != oif) {
273                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
274                                                 continue;
275                                         if (local && (!oif ||
276                                                       local->rt6i_idev->dev->ifindex == oif))
277                                                 continue;
278                                 }
279                                 local = sprt;
280                         }
281                 } else {
/* no oif: match on the source address instead */
282                         if (ipv6_chk_addr(net, saddr, dev,
283                                           flags & RT6_LOOKUP_F_IFACE))
284                                 return sprt;
285                 }
286         }
287
288         if (oif) {
289                 if (local)
290                         return local;
291
292                 if (flags & RT6_LOOKUP_F_IFACE)
293                         return net->ipv6.ip6_null_entry;
294         }
295 out:
296         return rt;
297 }
298
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing for the next hop of @rt.
 *
 * @rt: route whose rt6i_nexthop should be probed; may be NULL.
 *
 * Sends a neighbour solicitation to the solicited-node multicast address
 * of the next hop when its neighbour entry is not in a NUD_VALID state
 * and at least rtr_probe_interval has elapsed since neigh->updated.
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute.
 *
 * neigh->updated is the rate-limit timestamp and is modified here, so the
 * check-and-update is done under the write side of neigh->lock; with only
 * the read side held, concurrent callers could all pass the interval
 * check and each send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/* cheap unlocked pre-check; re-validated under the lock below */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies,
			neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		write_unlock_bh(&neigh->lock);
		return;
	}
	neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	/* the solicitation itself is sent with the lock dropped */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support disabled: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
333
334 /*
335  * Default Router Selection (RFC 2461 6.3.6)
336  */
337 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
338 {
339         struct net_device *dev = rt->rt6i_dev;
340         if (!oif || dev->ifindex == oif)
341                 return 2;
342         if ((dev->flags & IFF_LOOPBACK) &&
343             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
344                 return 1;
345         return 0;
346 }
347
348 static inline int rt6_check_neigh(struct rt6_info *rt)
349 {
350         struct neighbour *neigh = rt->rt6i_nexthop;
351         int m;
352         if (rt->rt6i_flags & RTF_NONEXTHOP ||
353             !(rt->rt6i_flags & RTF_GATEWAY))
354                 m = 1;
355         else if (neigh) {
356                 read_lock_bh(&neigh->lock);
357                 if (neigh->nud_state & NUD_VALID)
358                         m = 2;
359 #ifdef CONFIG_IPV6_ROUTER_PREF
360                 else if (neigh->nud_state & NUD_FAILED)
361                         m = 0;
362 #endif
363                 else
364                         m = 1;
365                 read_unlock_bh(&neigh->lock);
366         } else
367                 m = 0;
368         return m;
369 }
370
371 static int rt6_score_route(struct rt6_info *rt, int oif,
372                            int strict)
373 {
374         int m, n;
375
376         m = rt6_check_dev(rt, oif);
377         if (!m && (strict & RT6_LOOKUP_F_IFACE))
378                 return -1;
379 #ifdef CONFIG_IPV6_ROUTER_PREF
380         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
381 #endif
382         n = rt6_check_neigh(rt);
383         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
384                 return -1;
385         return m;
386 }
387
388 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
389                                    int *mpri, struct rt6_info *match)
390 {
391         int m;
392
393         if (rt6_check_expired(rt))
394                 goto out;
395
396         m = rt6_score_route(rt, oif, strict);
397         if (m < 0)
398                 goto out;
399
400         if (m > *mpri) {
401                 if (strict & RT6_LOOKUP_F_REACHABLE)
402                         rt6_probe(match);
403                 *mpri = m;
404                 match = rt;
405         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
406                 rt6_probe(rt);
407         }
408
409 out:
410         return match;
411 }
412
413 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
414                                      struct rt6_info *rr_head,
415                                      u32 metric, int oif, int strict)
416 {
417         struct rt6_info *rt, *match;
418         int mpri = -1;
419
420         match = NULL;
421         for (rt = rr_head; rt && rt->rt6i_metric == metric;
422              rt = rt->dst.rt6_next)
423                 match = find_match(rt, oif, strict, &mpri, match);
424         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
425              rt = rt->dst.rt6_next)
426                 match = find_match(rt, oif, strict, &mpri, match);
427
428         return match;
429 }
430
431 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
432 {
433         struct rt6_info *match, *rt0;
434         struct net *net;
435
436         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
437                   __func__, fn->leaf, oif);
438
439         rt0 = fn->rr_ptr;
440         if (!rt0)
441                 fn->rr_ptr = rt0 = fn->leaf;
442
443         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
444
445         if (!match &&
446             (strict & RT6_LOOKUP_F_REACHABLE)) {
447                 struct rt6_info *next = rt0->dst.rt6_next;
448
449                 /* no entries matched; do round-robin */
450                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
451                         next = fn->leaf;
452
453                 if (next != rt0)
454                         fn->rr_ptr = next;
455         }
456
457         RT6_TRACE("%s() => %p\n",
458                   __func__, match);
459
460         net = dev_net(rt0->rt6i_dev);
461         return match ? match : net->ipv6.ip6_null_entry;
462 }
463
464 #ifdef CONFIG_IPV6_ROUTE_INFO
465 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
466                   struct in6_addr *gwaddr)
467 {
468         struct net *net = dev_net(dev);
469         struct route_info *rinfo = (struct route_info *) opt;
470         struct in6_addr prefix_buf, *prefix;
471         unsigned int pref;
472         unsigned long lifetime;
473         struct rt6_info *rt;
474
475         if (len < sizeof(struct route_info)) {
476                 return -EINVAL;
477         }
478
479         /* Sanity check for prefix_len and length */
480         if (rinfo->length > 3) {
481                 return -EINVAL;
482         } else if (rinfo->prefix_len > 128) {
483                 return -EINVAL;
484         } else if (rinfo->prefix_len > 64) {
485                 if (rinfo->length < 2) {
486                         return -EINVAL;
487                 }
488         } else if (rinfo->prefix_len > 0) {
489                 if (rinfo->length < 1) {
490                         return -EINVAL;
491                 }
492         }
493
494         pref = rinfo->route_pref;
495         if (pref == ICMPV6_ROUTER_PREF_INVALID)
496                 return -EINVAL;
497
498         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
499
500         if (rinfo->length == 3)
501                 prefix = (struct in6_addr *)rinfo->prefix;
502         else {
503                 /* this function is safe */
504                 ipv6_addr_prefix(&prefix_buf,
505                                  (struct in6_addr *)rinfo->prefix,
506                                  rinfo->prefix_len);
507                 prefix = &prefix_buf;
508         }
509
510         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
511                                 dev->ifindex);
512
513         if (rt && !lifetime) {
514                 ip6_del_rt(rt);
515                 rt = NULL;
516         }
517
518         if (!rt && lifetime)
519                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
520                                         pref);
521         else if (rt)
522                 rt->rt6i_flags = RTF_ROUTEINFO |
523                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
524
525         if (rt) {
526                 if (!addrconf_finite_timeout(lifetime)) {
527                         rt->rt6i_flags &= ~RTF_EXPIRES;
528                 } else {
529                         rt->rt6i_expires = jiffies + HZ * lifetime;
530                         rt->rt6i_flags |= RTF_EXPIRES;
531                 }
532                 dst_release(&rt->dst);
533         }
534         return 0;
535 }
536 #endif
537
/*
 * BACKTRACK(__net, saddr) - climb the fib6 tree when a lookup produced
 * the null entry.
 *
 * The macro is not self-contained: it reads the expanding function's
 * local 'rt', updates its local 'fn', and jumps to its labels 'out' and
 * 'restart'.
 *
 * When rt is __net's ip6_null_entry it loops: if fn is flagged
 * RTN_TL_ROOT the search gives up (goto out); otherwise it moves to the
 * parent, or - when the parent has a subtree that is not fn itself -
 * to the node found by fib6_lookup() for saddr in that subtree.  As soon
 * as the new fn is flagged RTN_RTINFO, selection is retried on it
 * (goto restart).  For any other rt the macro does nothing.
 *
 * NOTE(review): __net is expanded without parentheses, so callers should
 * pass a plain identifier.
 */
538 #define BACKTRACK(__net, saddr)                 \
539 do { \
540         if (rt == __net->ipv6.ip6_null_entry) { \
541                 struct fib6_node *pn; \
542                 while (1) { \
543                         if (fn->fn_flags & RTN_TL_ROOT) \
544                                 goto out; \
545                         pn = fn->parent; \
546                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
547                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
548                         else \
549                                 fn = pn; \
550                         if (fn->fn_flags & RTN_RTINFO) \
551                                 goto restart; \
552                 } \
553         } \
554 } while(0)
555
/*
 * ip6_pol_route_lookup - plain (non-caching) lookup in @table for @fl.
 *
 * Holds table->tb6_lock for reading while it looks up the node for the
 * flow's destination/source, picks an entry with rt6_device_match() and
 * backtracks through the tree while that yields the null entry.  The
 * result is passed to dst_use() before the lock is dropped and returned.
 *
 * The locals 'fn' and 'rt' and the labels 'restart' and 'out' are used
 * by the BACKTRACK() macro and must keep these names.
 */
556 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
557                                              struct fib6_table *table,
558                                              struct flowi *fl, int flags)
559 {
560         struct fib6_node *fn;
561         struct rt6_info *rt;
562
563         read_lock_bh(&table->tb6_lock);
564         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
/* re-entered by BACKTRACK() with fn moved up the tree */
565 restart:
566         rt = fn->leaf;
567         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
568         BACKTRACK(net, &fl->fl6_src);
/* reached directly, or from BACKTRACK() at the top-level root */
569 out:
570         dst_use(&rt->dst, jiffies);
571         read_unlock_bh(&table->tb6_lock);
572         return rt;
573
574 }
575
576 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
577                             const struct in6_addr *saddr, int oif, int strict)
578 {
579         struct flowi fl = {
580                 .oif = oif,
581                 .fl6_dst = *daddr,
582         };
583         struct dst_entry *dst;
584         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
585
586         if (saddr) {
587                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
588                 flags |= RT6_LOOKUP_F_HAS_SADDR;
589         }
590
591         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
592         if (dst->error == 0)
593                 return (struct rt6_info *) dst;
594
595         dst_release(dst);
596
597         return NULL;
598 }
599
600 EXPORT_SYMBOL(rt6_lookup);
601
602 /* ip6_ins_rt is called with FREE table->tb6_lock.
603    It takes new route entry, the addition fails by any reason the
604    route is freed. In any case, if caller does not hold it, it may
605    be destroyed.
606  */
607
608 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
609 {
610         int err;
611         struct fib6_table *table;
612
613         table = rt->rt6i_table;
614         write_lock_bh(&table->tb6_lock);
615         err = fib6_add(&table->tb6_root, rt, info);
616         write_unlock_bh(&table->tb6_lock);
617
618         return err;
619 }
620
621 int ip6_ins_rt(struct rt6_info *rt)
622 {
623         struct nl_info info = {
624                 .nl_net = dev_net(rt->rt6i_dev),
625         };
626         return __ip6_ins_rt(rt, &info);
627 }
628
/*
 * rt6_alloc_cow - build a host (/128) cache entry for @daddr from @ort
 * and bind a neighbour entry to it.
 *
 * @ort:   route to copy (via ip6_rt_copy())
 * @daddr: destination the new entry is for
 * @saddr: source address; only used with CONFIG_IPV6_SUBTREES when the
 *         copied route has a source prefix
 *
 * Returns the new entry, or NULL when the copy fails or no neighbour
 * entry can be obtained (the copy is then released with dst_free()).
 *
 * If ndisc_get_neigh() fails and we are not in softirq context, one
 * forced garbage-collection pass is run and the lookup retried once.
 * NOTE(review): that pass temporarily overwrites the namespace-wide
 * ip6_rt_gc_elasticity / ip6_rt_gc_min_interval sysctls with no locking
 * visible here - confirm concurrent callers cannot restore stale values.
 */
629 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
630                                       struct in6_addr *saddr)
631 {
632         struct rt6_info *rt;
633
634         /*
635          *      Clone the route.
636          */
637
638         rt = ip6_rt_copy(ort);
639
640         if (rt) {
641                 struct neighbour *neigh;
/* one gc-and-retry attempt outside softirq, none inside */
642                 int attempts = !in_softirq();
643
/* non-gateway route: the destination itself becomes the next hop */
644                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
645                         if (rt->rt6i_dst.plen != 128 &&
646                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
647                                 rt->rt6i_flags |= RTF_ANYCAST;
648                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
649                 }
650
/* narrow the copy down to a host cache entry for daddr */
651                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
652                 rt->rt6i_dst.plen = 128;
653                 rt->rt6i_flags |= RTF_CACHE;
654                 rt->dst.flags |= DST_HOST;
655
656 #ifdef CONFIG_IPV6_SUBTREES
657                 if (rt->rt6i_src.plen && saddr) {
658                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
659                         rt->rt6i_src.plen = 128;
660                 }
661 #endif
662
663         retry:
664                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
665                 if (IS_ERR(neigh)) {
666                         struct net *net = dev_net(rt->rt6i_dev);
667                         int saved_rt_min_interval =
668                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
669                         int saved_rt_elasticity =
670                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
671
672                         if (attempts-- > 0) {
/* run ip6_dst_gc() with elasticity 1 and no minimum interval */
673                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
674                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
675
676                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
677
678                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
679                                         saved_rt_elasticity;
680                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
681                                         saved_rt_min_interval;
682                                 goto retry;
683                         }
684
685                         if (net_ratelimit())
686                                 printk(KERN_WARNING
687                                        "ipv6: Neighbour table overflow.\n");
688                         dst_free(&rt->dst);
689                         return NULL;
690                 }
691                 rt->rt6i_nexthop = neigh;
692
693         }
694
695         return rt;
696 }
697
698 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
699 {
700         struct rt6_info *rt = ip6_rt_copy(ort);
701         if (rt) {
702                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
703                 rt->rt6i_dst.plen = 128;
704                 rt->rt6i_flags |= RTF_CACHE;
705                 rt->dst.flags |= DST_HOST;
706                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
707         }
708         return rt;
709 }
710
/*
 * ip6_pol_route - caching route lookup in @table for flow @fl.
 *
 * @oif:   interface index used for route selection (callers pass the
 *         flow's iif or oif)
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is carried into
 *         the selection
 *
 * Returns a route with a dst reference held and its lastuse/__use
 * updated; on failure this is the namespace's null entry.
 *
 * Under the table read lock the node is looked up and rt6_select() picks
 * an entry.  The first pass demands a reachable router unless forwarding
 * is enabled in devconf_all; if that pass ends in the null entry or in a
 * cached entry, the lookup is repeated once without that demand.
 *
 * A non-cached result is turned into a per-destination entry outside the
 * lock (rt6_alloc_cow() when the route has no next hop and is not
 * RTF_NONEXTHOP, rt6_alloc_clone() otherwise) and inserted with
 * ip6_ins_rt().  If allocation or insertion fails, the whole lookup is
 * redone, up to 3 attempts in total.
 *
 * The locals 'fn' and 'rt' and the labels 'restart' and 'out' are used
 * by the BACKTRACK() macro and must keep these names.
 */
711 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
712                                       struct flowi *fl, int flags)
713 {
714         struct fib6_node *fn;
715         struct rt6_info *rt, *nrt;
716         int strict = 0;
717         int attempts = 3;
718         int err;
719         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
720
721         strict |= flags & RT6_LOOKUP_F_IFACE;
722
723 relookup:
724         read_lock_bh(&table->tb6_lock);
725
/* re-entered from 'out' with the lock still held and reachable cleared */
726 restart_2:
727         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
728
729 restart:
730         rt = rt6_select(fn, oif, strict | reachable);
731
732         BACKTRACK(net, &fl->fl6_src);
733         if (rt == net->ipv6.ip6_null_entry ||
734             rt->rt6i_flags & RTF_CACHE)
735                 goto out;
736
/* keep rt alive while the lock is dropped for the allocation */
737         dst_hold(&rt->dst);
738         read_unlock_bh(&table->tb6_lock);
739
740         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
741                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
742         else
743                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
744
745         dst_release(&rt->dst);
/* allocation failure falls back to the null entry */
746         rt = nrt ? : net->ipv6.ip6_null_entry;
747
/* reference for the caller, taken before the insertion attempt */
748         dst_hold(&rt->dst);
749         if (nrt) {
750                 err = ip6_ins_rt(nrt);
751                 if (!err)
752                         goto out2;
753         }
754
755         if (--attempts <= 0)
756                 goto out2;
757
758         /*
759          * Race condition! In the gap, when table->tb6_lock was
760          * released someone could insert this route.  Relookup.
761          */
762         dst_release(&rt->dst);
763         goto relookup;
764
765 out:
/* first pass required reachability: retry once without it */
766         if (reachable) {
767                 reachable = 0;
768                 goto restart_2;
769         }
770         dst_hold(&rt->dst);
771         read_unlock_bh(&table->tb6_lock);
772 out2:
773         rt->dst.lastuse = jiffies;
774         rt->dst.__use++;
775
776         return rt;
777 }
778
779 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
780                                             struct flowi *fl, int flags)
781 {
782         return ip6_pol_route(net, table, fl->iif, fl, flags);
783 }
784
785 void ip6_route_input(struct sk_buff *skb)
786 {
787         struct ipv6hdr *iph = ipv6_hdr(skb);
788         struct net *net = dev_net(skb->dev);
789         int flags = RT6_LOOKUP_F_HAS_SADDR;
790         struct flowi fl = {
791                 .iif = skb->dev->ifindex,
792                 .fl6_dst = iph->daddr,
793                 .fl6_src = iph->saddr,
794                 .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
795                 .mark = skb->mark,
796                 .proto = iph->nexthdr,
797         };
798
799         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
800                 flags |= RT6_LOOKUP_F_IFACE;
801
802         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
803 }
804
805 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
806                                              struct flowi *fl, int flags)
807 {
808         return ip6_pol_route(net, table, fl->oif, fl, flags);
809 }
810
811 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
812                                     struct flowi *fl)
813 {
814         int flags = 0;
815
816         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
817                 flags |= RT6_LOOKUP_F_IFACE;
818
819         if (!ipv6_addr_any(&fl->fl6_src))
820                 flags |= RT6_LOOKUP_F_HAS_SADDR;
821         else if (sk)
822                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
823
824         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
825 }
826
827 EXPORT_SYMBOL(ip6_route_output);
828
829 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
830 {
831         struct rt6_info *ort = (struct rt6_info *) *dstp;
832         struct rt6_info *rt = (struct rt6_info *)
833                 dst_alloc(&ip6_dst_blackhole_ops);
834         struct dst_entry *new = NULL;
835
836         if (rt) {
837                 new = &rt->dst;
838
839                 atomic_set(&new->__refcnt, 1);
840                 new->__use = 1;
841                 new->input = dst_discard;
842                 new->output = dst_discard;
843
844                 dst_copy_metrics(new, &ort->dst);
845                 new->dev = ort->dst.dev;
846                 if (new->dev)
847                         dev_hold(new->dev);
848                 rt->rt6i_idev = ort->rt6i_idev;
849                 if (rt->rt6i_idev)
850                         in6_dev_hold(rt->rt6i_idev);
851                 rt->rt6i_expires = 0;
852
853                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
854                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
855                 rt->rt6i_metric = 0;
856
857                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
858 #ifdef CONFIG_IPV6_SUBTREES
859                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
860 #endif
861
862                 dst_free(new);
863         }
864
865         dst_release(*dstp);
866         *dstp = new;
867         return new ? 0 : -ENOMEM;
868 }
869 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
870
871 /*
872  *      Destination cache support functions
873  */
874
875 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
876 {
877         struct rt6_info *rt;
878
879         rt = (struct rt6_info *) dst;
880
881         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
882                 return dst;
883
884         return NULL;
885 }
886
887 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
888 {
889         struct rt6_info *rt = (struct rt6_info *) dst;
890
891         if (rt) {
892                 if (rt->rt6i_flags & RTF_CACHE) {
893                         if (rt6_check_expired(rt)) {
894                                 ip6_del_rt(rt);
895                                 dst = NULL;
896                         }
897                 } else {
898                         dst_release(dst);
899                         dst = NULL;
900                 }
901         }
902         return dst;
903 }
904
905 static void ip6_link_failure(struct sk_buff *skb)
906 {
907         struct rt6_info *rt;
908
909         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
910
911         rt = (struct rt6_info *) skb_dst(skb);
912         if (rt) {
913                 if (rt->rt6i_flags&RTF_CACHE) {
914                         dst_set_expires(&rt->dst, 0);
915                         rt->rt6i_flags |= RTF_EXPIRES;
916                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
917                         rt->rt6i_node->fn_sernum = -1;
918         }
919 }
920
921 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
922 {
923         struct rt6_info *rt6 = (struct rt6_info*)dst;
924
925         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
926                 rt6->rt6i_flags |= RTF_MODIFIED;
927                 if (mtu < IPV6_MIN_MTU) {
928                         u32 features = dst_metric(dst, RTAX_FEATURES);
929                         mtu = IPV6_MIN_MTU;
930                         features |= RTAX_FEATURE_ALLFRAG;
931                         dst_metric_set(dst, RTAX_FEATURES, features);
932                 }
933                 dst_metric_set(dst, RTAX_MTU, mtu);
934                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
935         }
936 }
937
938 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
939 {
940         struct net_device *dev = dst->dev;
941         unsigned int mtu = dst_mtu(dst);
942         struct net *net = dev_net(dev);
943
944         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
945
946         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
947                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
948
949         /*
950          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
951          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
952          * IPV6_MAXPLEN is also valid and means: "any MSS,
953          * rely only on pmtu discovery"
954          */
955         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
956                 mtu = IPV6_MAXPLEN;
957         return mtu;
958 }
959
960 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
961 {
962         unsigned int mtu = IPV6_MIN_MTU;
963         struct inet6_dev *idev;
964
965         rcu_read_lock();
966         idev = __in6_dev_get(dst->dev);
967         if (idev)
968                 mtu = idev->cnf.mtu6;
969         rcu_read_unlock();
970
971         return mtu;
972 }
973
/*
 * Head of the singly linked list (chained through dst->next) of entries
 * created by icmp6_dst_alloc().  icmp6_dst_gc() and icmp6_clean_all()
 * unlink entries from it.
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Protects icmp6_dst_gc_list; always taken with spin_lock_bh(). */
static DEFINE_SPINLOCK(icmp6_dst_lock);
976
977 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
978                                   struct neighbour *neigh,
979                                   const struct in6_addr *addr)
980 {
981         struct rt6_info *rt;
982         struct inet6_dev *idev = in6_dev_get(dev);
983         struct net *net = dev_net(dev);
984
985         if (unlikely(idev == NULL))
986                 return NULL;
987
988         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
989         if (unlikely(rt == NULL)) {
990                 in6_dev_put(idev);
991                 goto out;
992         }
993
994         dev_hold(dev);
995         if (neigh)
996                 neigh_hold(neigh);
997         else {
998                 neigh = ndisc_get_neigh(dev, addr);
999                 if (IS_ERR(neigh))
1000                         neigh = NULL;
1001         }
1002
1003         rt->rt6i_dev      = dev;
1004         rt->rt6i_idev     = idev;
1005         rt->rt6i_nexthop  = neigh;
1006         atomic_set(&rt->dst.__refcnt, 1);
1007         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1008         rt->dst.output  = ip6_output;
1009
1010 #if 0   /* there's no chance to use these for ndisc */
1011         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1012                                 ? DST_HOST
1013                                 : 0;
1014         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1015         rt->rt6i_dst.plen = 128;
1016 #endif
1017
1018         spin_lock_bh(&icmp6_dst_lock);
1019         rt->dst.next = icmp6_dst_gc_list;
1020         icmp6_dst_gc_list = &rt->dst;
1021         spin_unlock_bh(&icmp6_dst_lock);
1022
1023         fib6_force_start_gc(net);
1024
1025 out:
1026         return &rt->dst;
1027 }
1028
1029 int icmp6_dst_gc(void)
1030 {
1031         struct dst_entry *dst, *next, **pprev;
1032         int more = 0;
1033
1034         next = NULL;
1035
1036         spin_lock_bh(&icmp6_dst_lock);
1037         pprev = &icmp6_dst_gc_list;
1038
1039         while ((dst = *pprev) != NULL) {
1040                 if (!atomic_read(&dst->__refcnt)) {
1041                         *pprev = dst->next;
1042                         dst_free(dst);
1043                 } else {
1044                         pprev = &dst->next;
1045                         ++more;
1046                 }
1047         }
1048
1049         spin_unlock_bh(&icmp6_dst_lock);
1050
1051         return more;
1052 }
1053
1054 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1055                             void *arg)
1056 {
1057         struct dst_entry *dst, **pprev;
1058
1059         spin_lock_bh(&icmp6_dst_lock);
1060         pprev = &icmp6_dst_gc_list;
1061         while ((dst = *pprev) != NULL) {
1062                 struct rt6_info *rt = (struct rt6_info *) dst;
1063                 if (func(rt, arg)) {
1064                         *pprev = dst->next;
1065                         dst_free(dst);
1066                 } else {
1067                         pprev = &dst->next;
1068                 }
1069         }
1070         spin_unlock_bh(&icmp6_dst_lock);
1071 }
1072
1073 static int ip6_dst_gc(struct dst_ops *ops)
1074 {
1075         unsigned long now = jiffies;
1076         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1077         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1078         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1079         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1080         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1081         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1082         int entries;
1083
1084         entries = dst_entries_get_fast(ops);
1085         if (time_after(rt_last_gc + rt_min_interval, now) &&
1086             entries <= rt_max_size)
1087                 goto out;
1088
1089         net->ipv6.ip6_rt_gc_expire++;
1090         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1091         net->ipv6.ip6_rt_last_gc = now;
1092         entries = dst_entries_get_slow(ops);
1093         if (entries < ops->gc_thresh)
1094                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1095 out:
1096         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1097         return entries > rt_max_size;
1098 }
1099
1100 /* Clean host part of a prefix. Not necessary in radix tree,
1101    but results in cleaner routing tables.
1102
1103    Remove it only when all the things will work!
1104  */
1105
1106 int ip6_dst_hoplimit(struct dst_entry *dst)
1107 {
1108         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1109         if (hoplimit == 0) {
1110                 struct net_device *dev = dst->dev;
1111                 struct inet6_dev *idev;
1112
1113                 rcu_read_lock();
1114                 idev = __in6_dev_get(dev);
1115                 if (idev)
1116                         hoplimit = idev->cnf.hop_limit;
1117                 else
1118                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1119                 rcu_read_unlock();
1120         }
1121         return hoplimit;
1122 }
1123 EXPORT_SYMBOL(ip6_dst_hoplimit);
1124
1125 /*
1126  *
1127  */
1128
/*
 * Build a route from @cfg and insert it with __ip6_ins_rt().
 *
 * Side effects on @cfg: fc_metric is replaced by IP6_RT_PRIO_USER when
 * it is 0, fc_protocol is replaced by RTPROT_BOOT when it is
 * RTPROT_UNSPEC, and fc_nlinfo.nl_net is overwritten with the namespace
 * of the device finally chosen.
 *
 * Returns the result of __ip6_ins_rt() on the success path.  Before
 * that point the function fails with:
 *   -EINVAL        bad prefix length, source prefix without
 *                  CONFIG_IPV6_SUBTREES, non-unicast gateway, gateway
 *                  route without device or via loopback, or a metric
 *                  attribute type above RTAX_MAX
 *   -ENODEV        device / inet6 device not found, or no device known
 *   -ENOBUFS       fib6_new_table() failed
 *   -ENOMEM        route allocation failed
 *   -EHOSTUNREACH  gateway lookup failed, resolved to another device,
 *                  or resolved to a route that is itself gatewayed
 *   the error returned by __neigh_lookup_errno()
 *
 * On every "goto out" failure the device and inet6 device references
 * taken here are dropped and the partially built route is freed.
 */
int ip6_route_add(struct fib6_config *cfg)
{
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;

        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len)
                return -EINVAL;
#endif
        /* An explicit interface index pins both dev and idev (with references). */
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        table = fib6_new_table(net, cfg->fc_table);
        if (table == NULL) {
                err = -ENOBUFS;
                goto out;
        }

        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);

        if (rt == NULL) {
                err = -ENOMEM;
                goto out;
        }

        rt->dst.obsolete = -1;
        rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
                                jiffies + clock_t_to_jiffies(cfg->fc_expires) :
                                0;

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* Input handler depends on destination type: multicast, local, or forwarded. */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
               rt->dst.flags = DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
                                              && !(cfg->fc_flags&RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                /* Reject route: discard in both directions, report -ENETUNREACH. */
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
                rt->dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
                gwa_type = ipv6_addr_type(gw_addr);

                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;

                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
                           Otherwise, router will not able to send redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
                        err = -EINVAL;
                        if (!(gwa_type&IPV6_ADDR_UNICAST))
                                goto out;

                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

                        /*
                         * err stays -EHOSTUNREACH unless the gateway resolves
                         * to a non-gatewayed route on the expected device.
                         */
                        err = -EHOSTUNREACH;
                        if (grt == NULL)
                                goto out;
                        if (dev) {
                                if (dev != grt->rt6i_dev) {
                                        dst_release(&grt->dst);
                                        goto out;
                                }
                        } else {
                                /* No device given: adopt the one the gateway route uses. */
                                dev = grt->rt6i_dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        if (!(grt->rt6i_flags&RTF_GATEWAY))
                                err = 0;
                        dst_release(&grt->dst);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (dev == NULL || (dev->flags&IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (dev == NULL)
                goto out;

        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
                if (IS_ERR(rt->rt6i_nexthop)) {
                        err = PTR_ERR(rt->rt6i_nexthop);
                        /* Do not leave an ERR_PTR in the route that is about to be freed. */
                        rt->rt6i_nexthop = NULL;
                        goto out;
                }
        }

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        /* Apply metrics passed as netlink attributes; type 0 is skipped. */
        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX) {
                                        err = -EINVAL;
                                        goto out;
                                }

                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
                        }
                }
        }

        /* The dev/idev references taken above are handed over to the route here. */
        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
        return err;
}
1331
1332 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1333 {
1334         int err;
1335         struct fib6_table *table;
1336         struct net *net = dev_net(rt->rt6i_dev);
1337
1338         if (rt == net->ipv6.ip6_null_entry)
1339                 return -ENOENT;
1340
1341         table = rt->rt6i_table;
1342         write_lock_bh(&table->tb6_lock);
1343
1344         err = fib6_del(rt, info);
1345         dst_release(&rt->dst);
1346
1347         write_unlock_bh(&table->tb6_lock);
1348
1349         return err;
1350 }
1351
1352 int ip6_del_rt(struct rt6_info *rt)
1353 {
1354         struct nl_info info = {
1355                 .nl_net = dev_net(rt->rt6i_dev),
1356         };
1357         return __ip6_del_rt(rt, &info);
1358 }
1359
1360 static int ip6_route_del(struct fib6_config *cfg)
1361 {
1362         struct fib6_table *table;
1363         struct fib6_node *fn;
1364         struct rt6_info *rt;
1365         int err = -ESRCH;
1366
1367         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1368         if (table == NULL)
1369                 return err;
1370
1371         read_lock_bh(&table->tb6_lock);
1372
1373         fn = fib6_locate(&table->tb6_root,
1374                          &cfg->fc_dst, cfg->fc_dst_len,
1375                          &cfg->fc_src, cfg->fc_src_len);
1376
1377         if (fn) {
1378                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1379                         if (cfg->fc_ifindex &&
1380                             (rt->rt6i_dev == NULL ||
1381                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1382                                 continue;
1383                         if (cfg->fc_flags & RTF_GATEWAY &&
1384                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1385                                 continue;
1386                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1387                                 continue;
1388                         dst_hold(&rt->dst);
1389                         read_unlock_bh(&table->tb6_lock);
1390
1391                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1392                 }
1393         }
1394         read_unlock_bh(&table->tb6_lock);
1395
1396         return err;
1397 }
1398
1399 /*
1400  *      Handle redirects
1401  */
/*
 * Flow key extended with the address of the router that sent a
 * redirect.  ip6_route_redirect() passes a pointer to this structure
 * cast to struct flowi *, and __ip6_route_redirect() casts it back.
 */
struct ip6rd_flowi {
        struct flowi fl;                /* must remain the first member (see casts above) */
        struct in6_addr gateway;        /* compared against rt6i_gateway of candidate routes */
};
1406
1407 static struct rt6_info *__ip6_route_redirect(struct net *net,
1408                                              struct fib6_table *table,
1409                                              struct flowi *fl,
1410                                              int flags)
1411 {
1412         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1413         struct rt6_info *rt;
1414         struct fib6_node *fn;
1415
1416         /*
1417          * Get the "current" route for this destination and
1418          * check if the redirect has come from approriate router.
1419          *
1420          * RFC 2461 specifies that redirects should only be
1421          * accepted if they come from the nexthop to the target.
1422          * Due to the way the routes are chosen, this notion
1423          * is a bit fuzzy and one might need to check all possible
1424          * routes.
1425          */
1426
1427         read_lock_bh(&table->tb6_lock);
1428         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1429 restart:
1430         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1431                 /*
1432                  * Current route is on-link; redirect is always invalid.
1433                  *
1434                  * Seems, previous statement is not true. It could
1435                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1436                  * But then router serving it might decide, that we should
1437                  * know truth 8)8) --ANK (980726).
1438                  */
1439                 if (rt6_check_expired(rt))
1440                         continue;
1441                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1442                         continue;
1443                 if (fl->oif != rt->rt6i_dev->ifindex)
1444                         continue;
1445                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1446                         continue;
1447                 break;
1448         }
1449
1450         if (!rt)
1451                 rt = net->ipv6.ip6_null_entry;
1452         BACKTRACK(net, &fl->fl6_src);
1453 out:
1454         dst_hold(&rt->dst);
1455
1456         read_unlock_bh(&table->tb6_lock);
1457
1458         return rt;
1459 };
1460
1461 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1462                                            struct in6_addr *src,
1463                                            struct in6_addr *gateway,
1464                                            struct net_device *dev)
1465 {
1466         int flags = RT6_LOOKUP_F_HAS_SADDR;
1467         struct net *net = dev_net(dev);
1468         struct ip6rd_flowi rdfl = {
1469                 .fl = {
1470                         .oif = dev->ifindex,
1471                         .fl6_dst = *dest,
1472                         .fl6_src = *src,
1473                 },
1474         };
1475
1476         ipv6_addr_copy(&rdfl.gateway, gateway);
1477
1478         if (rt6_need_strict(dest))
1479                 flags |= RT6_LOOKUP_F_IFACE;
1480
1481         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1482                                                    flags, __ip6_route_redirect);
1483 }
1484
1485 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1486                   struct in6_addr *saddr,
1487                   struct neighbour *neigh, u8 *lladdr, int on_link)
1488 {
1489         struct rt6_info *rt, *nrt = NULL;
1490         struct netevent_redirect netevent;
1491         struct net *net = dev_net(neigh->dev);
1492
1493         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1494
1495         if (rt == net->ipv6.ip6_null_entry) {
1496                 if (net_ratelimit())
1497                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1498                                "for redirect target\n");
1499                 goto out;
1500         }
1501
1502         /*
1503          *      We have finally decided to accept it.
1504          */
1505
1506         neigh_update(neigh, lladdr, NUD_STALE,
1507                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1508                      NEIGH_UPDATE_F_OVERRIDE|
1509                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1510                                      NEIGH_UPDATE_F_ISROUTER))
1511                      );
1512
1513         /*
1514          * Redirect received -> path was valid.
1515          * Look, redirects are sent only in response to data packets,
1516          * so that this nexthop apparently is reachable. --ANK
1517          */
1518         dst_confirm(&rt->dst);
1519
1520         /* Duplicate redirect: silently ignore. */
1521         if (neigh == rt->dst.neighbour)
1522                 goto out;
1523
1524         nrt = ip6_rt_copy(rt);
1525         if (nrt == NULL)
1526                 goto out;
1527
1528         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1529         if (on_link)
1530                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1531
1532         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1533         nrt->rt6i_dst.plen = 128;
1534         nrt->dst.flags |= DST_HOST;
1535
1536         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1537         nrt->rt6i_nexthop = neigh_clone(neigh);
1538
1539         if (ip6_ins_rt(nrt))
1540                 goto out;
1541
1542         netevent.old = &rt->dst;
1543         netevent.new = &nrt->dst;
1544         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1545
1546         if (rt->rt6i_flags&RTF_CACHE) {
1547                 ip6_del_rt(rt);
1548                 return;
1549         }
1550
1551 out:
1552         dst_release(&rt->dst);
1553 }
1554
1555 /*
1556  *      Handle ICMP "packet too big" messages
1557  *      i.e. Path MTU discovery
1558  */
1559
1560 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1561                              struct net *net, u32 pmtu, int ifindex)
1562 {
1563         struct rt6_info *rt, *nrt;
1564         int allfrag = 0;
1565 again:
1566         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1567         if (rt == NULL)
1568                 return;
1569
1570         if (rt6_check_expired(rt)) {
1571                 ip6_del_rt(rt);
1572                 goto again;
1573         }
1574
1575         if (pmtu >= dst_mtu(&rt->dst))
1576                 goto out;
1577
1578         if (pmtu < IPV6_MIN_MTU) {
1579                 /*
1580                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1581                  * MTU (1280) and a fragment header should always be included
1582                  * after a node receiving Too Big message reporting PMTU is
1583                  * less than the IPv6 Minimum Link MTU.
1584                  */
1585                 pmtu = IPV6_MIN_MTU;
1586                 allfrag = 1;
1587         }
1588
1589         /* New mtu received -> path was valid.
1590            They are sent only in response to data packets,
1591            so that this nexthop apparently is reachable. --ANK
1592          */
1593         dst_confirm(&rt->dst);
1594
1595         /* Host route. If it is static, it would be better
1596            not to override it, but add new one, so that
1597            when cache entry will expire old pmtu
1598            would return automatically.
1599          */
1600         if (rt->rt6i_flags & RTF_CACHE) {
1601                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1602                 if (allfrag) {
1603                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1604                         features |= RTAX_FEATURE_ALLFRAG;
1605                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1606                 }
1607                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1608                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1609                 goto out;
1610         }
1611
1612         /* Network route.
1613            Two cases are possible:
1614            1. It is connected route. Action: COW
1615            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1616          */
1617         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1618                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1619         else
1620                 nrt = rt6_alloc_clone(rt, daddr);
1621
1622         if (nrt) {
1623                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1624                 if (allfrag) {
1625                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1626                         features |= RTAX_FEATURE_ALLFRAG;
1627                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1628                 }
1629
1630                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1631                  * happened within 5 mins, the recommended timer is 10 mins.
1632                  * Here this route expiration time is set to ip6_rt_mtu_expires
1633                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1634                  * and detecting PMTU increase will be automatically happened.
1635                  */
1636                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1637                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1638
1639                 ip6_ins_rt(nrt);
1640         }
1641 out:
1642         dst_release(&rt->dst);
1643 }
1644
1645 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1646                         struct net_device *dev, u32 pmtu)
1647 {
1648         struct net *net = dev_net(dev);
1649
1650         /*
1651          * RFC 1981 states that a node "MUST reduce the size of the packets it
1652          * is sending along the path" that caused the Packet Too Big message.
1653          * Since it's not possible in the general case to determine which
1654          * interface was used to send the original packet, we update the MTU
1655          * on the interface that will be used to send future packets. We also
1656          * update the MTU on the interface that received the Packet Too Big in
1657          * case the original packet was forced out that interface with
1658          * SO_BINDTODEVICE or similar. This is the next best thing to the
1659          * correct behaviour, which would be to update the MTU on all
1660          * interfaces.
1661          */
1662         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1663         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1664 }
1665
1666 /*
1667  *      Misc support functions
1668  */
1669
1670 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1671 {
1672         struct net *net = dev_net(ort->rt6i_dev);
1673         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1674
1675         if (rt) {
1676                 rt->dst.input = ort->dst.input;
1677                 rt->dst.output = ort->dst.output;
1678
1679                 dst_copy_metrics(&rt->dst, &ort->dst);
1680                 rt->dst.error = ort->dst.error;
1681                 rt->dst.dev = ort->dst.dev;
1682                 if (rt->dst.dev)
1683                         dev_hold(rt->dst.dev);
1684                 rt->rt6i_idev = ort->rt6i_idev;
1685                 if (rt->rt6i_idev)
1686                         in6_dev_hold(rt->rt6i_idev);
1687                 rt->dst.lastuse = jiffies;
1688                 rt->rt6i_expires = 0;
1689
1690                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1691                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1692                 rt->rt6i_metric = 0;
1693
1694                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1695 #ifdef CONFIG_IPV6_SUBTREES
1696                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1697 #endif
1698                 rt->rt6i_table = ort->rt6i_table;
1699         }
1700         return rt;
1701 }
1702
1703 #ifdef CONFIG_IPV6_ROUTE_INFO
1704 static struct rt6_info *rt6_get_route_info(struct net *net,
1705                                            struct in6_addr *prefix, int prefixlen,
1706                                            struct in6_addr *gwaddr, int ifindex)
1707 {
1708         struct fib6_node *fn;
1709         struct rt6_info *rt = NULL;
1710         struct fib6_table *table;
1711
1712         table = fib6_get_table(net, RT6_TABLE_INFO);
1713         if (table == NULL)
1714                 return NULL;
1715
1716         write_lock_bh(&table->tb6_lock);
1717         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1718         if (!fn)
1719                 goto out;
1720
1721         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1722                 if (rt->rt6i_dev->ifindex != ifindex)
1723                         continue;
1724                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1725                         continue;
1726                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1727                         continue;
1728                 dst_hold(&rt->dst);
1729                 break;
1730         }
1731 out:
1732         write_unlock_bh(&table->tb6_lock);
1733         return rt;
1734 }
1735
1736 static struct rt6_info *rt6_add_route_info(struct net *net,
1737                                            struct in6_addr *prefix, int prefixlen,
1738                                            struct in6_addr *gwaddr, int ifindex,
1739                                            unsigned pref)
1740 {
1741         struct fib6_config cfg = {
1742                 .fc_table       = RT6_TABLE_INFO,
1743                 .fc_metric      = IP6_RT_PRIO_USER,
1744                 .fc_ifindex     = ifindex,
1745                 .fc_dst_len     = prefixlen,
1746                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1747                                   RTF_UP | RTF_PREF(pref),
1748                 .fc_nlinfo.pid = 0,
1749                 .fc_nlinfo.nlh = NULL,
1750                 .fc_nlinfo.nl_net = net,
1751         };
1752
1753         ipv6_addr_copy(&cfg.fc_dst, prefix);
1754         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1755
1756         /* We should treat it as a default route if prefix length is 0. */
1757         if (!prefixlen)
1758                 cfg.fc_flags |= RTF_DEFAULT;
1759
1760         ip6_route_add(&cfg);
1761
1762         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1763 }
1764 #endif
1765
1766 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1767 {
1768         struct rt6_info *rt;
1769         struct fib6_table *table;
1770
1771         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1772         if (table == NULL)
1773                 return NULL;
1774
1775         write_lock_bh(&table->tb6_lock);
1776         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1777                 if (dev == rt->rt6i_dev &&
1778                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1779                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1780                         break;
1781         }
1782         if (rt)
1783                 dst_hold(&rt->dst);
1784         write_unlock_bh(&table->tb6_lock);
1785         return rt;
1786 }
1787
1788 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1789                                      struct net_device *dev,
1790                                      unsigned int pref)
1791 {
1792         struct fib6_config cfg = {
1793                 .fc_table       = RT6_TABLE_DFLT,
1794                 .fc_metric      = IP6_RT_PRIO_USER,
1795                 .fc_ifindex     = dev->ifindex,
1796                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1797                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1798                 .fc_nlinfo.pid = 0,
1799                 .fc_nlinfo.nlh = NULL,
1800                 .fc_nlinfo.nl_net = dev_net(dev),
1801         };
1802
1803         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1804
1805         ip6_route_add(&cfg);
1806
1807         return rt6_get_dflt_router(gwaddr, dev);
1808 }
1809
1810 void rt6_purge_dflt_routers(struct net *net)
1811 {
1812         struct rt6_info *rt;
1813         struct fib6_table *table;
1814
1815         /* NOTE: Keep consistent with rt6_get_dflt_router */
1816         table = fib6_get_table(net, RT6_TABLE_DFLT);
1817         if (table == NULL)
1818                 return;
1819
1820 restart:
1821         read_lock_bh(&table->tb6_lock);
1822         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1823                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1824                         dst_hold(&rt->dst);
1825                         read_unlock_bh(&table->tb6_lock);
1826                         ip6_del_rt(rt);
1827                         goto restart;
1828                 }
1829         }
1830         read_unlock_bh(&table->tb6_lock);
1831 }
1832
1833 static void rtmsg_to_fib6_config(struct net *net,
1834                                  struct in6_rtmsg *rtmsg,
1835                                  struct fib6_config *cfg)
1836 {
1837         memset(cfg, 0, sizeof(*cfg));
1838
1839         cfg->fc_table = RT6_TABLE_MAIN;
1840         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1841         cfg->fc_metric = rtmsg->rtmsg_metric;
1842         cfg->fc_expires = rtmsg->rtmsg_info;
1843         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1844         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1845         cfg->fc_flags = rtmsg->rtmsg_flags;
1846
1847         cfg->fc_nlinfo.nl_net = net;
1848
1849         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1850         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1851         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1852 }
1853
1854 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1855 {
1856         struct fib6_config cfg;
1857         struct in6_rtmsg rtmsg;
1858         int err;
1859
1860         switch(cmd) {
1861         case SIOCADDRT:         /* Add a route */
1862         case SIOCDELRT:         /* Delete a route */
1863                 if (!capable(CAP_NET_ADMIN))
1864                         return -EPERM;
1865                 err = copy_from_user(&rtmsg, arg,
1866                                      sizeof(struct in6_rtmsg));
1867                 if (err)
1868                         return -EFAULT;
1869
1870                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1871
1872                 rtnl_lock();
1873                 switch (cmd) {
1874                 case SIOCADDRT:
1875                         err = ip6_route_add(&cfg);
1876                         break;
1877                 case SIOCDELRT:
1878                         err = ip6_route_del(&cfg);
1879                         break;
1880                 default:
1881                         err = -EINVAL;
1882                 }
1883                 rtnl_unlock();
1884
1885                 return err;
1886         }
1887
1888         return -EINVAL;
1889 }
1890
1891 /*
1892  *      Drop the packet on the floor
1893  */
1894
1895 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1896 {
1897         int type;
1898         struct dst_entry *dst = skb_dst(skb);
1899         switch (ipstats_mib_noroutes) {
1900         case IPSTATS_MIB_INNOROUTES:
1901                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1902                 if (type == IPV6_ADDR_ANY) {
1903                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1904                                       IPSTATS_MIB_INADDRERRORS);
1905                         break;
1906                 }
1907                 /* FALLTHROUGH */
1908         case IPSTATS_MIB_OUTNOROUTES:
1909                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1910                               ipstats_mib_noroutes);
1911                 break;
1912         }
1913         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1914         kfree_skb(skb);
1915         return 0;
1916 }
1917
1918 static int ip6_pkt_discard(struct sk_buff *skb)
1919 {
1920         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1921 }
1922
1923 static int ip6_pkt_discard_out(struct sk_buff *skb)
1924 {
1925         skb->dev = skb_dst(skb)->dev;
1926         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1927 }
1928
1929 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1930
1931 static int ip6_pkt_prohibit(struct sk_buff *skb)
1932 {
1933         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1934 }
1935
1936 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1937 {
1938         skb->dev = skb_dst(skb)->dev;
1939         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1940 }
1941
1942 #endif
1943
1944 /*
1945  *      Allocate a dst for local (unicast / anycast) address.
1946  */
1947
1948 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1949                                     const struct in6_addr *addr,
1950                                     int anycast)
1951 {
1952         struct net *net = dev_net(idev->dev);
1953         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1954         struct neighbour *neigh;
1955
1956         if (rt == NULL) {
1957                 if (net_ratelimit())
1958                         pr_warning("IPv6:  Maximum number of routes reached,"
1959                                    " consider increasing route/max_size.\n");
1960                 return ERR_PTR(-ENOMEM);
1961         }
1962
1963         dev_hold(net->loopback_dev);
1964         in6_dev_hold(idev);
1965
1966         rt->dst.flags = DST_HOST;
1967         rt->dst.input = ip6_input;
1968         rt->dst.output = ip6_output;
1969         rt->rt6i_dev = net->loopback_dev;
1970         rt->rt6i_idev = idev;
1971         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1972         rt->dst.obsolete = -1;
1973
1974         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1975         if (anycast)
1976                 rt->rt6i_flags |= RTF_ANYCAST;
1977         else
1978                 rt->rt6i_flags |= RTF_LOCAL;
1979         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1980         if (IS_ERR(neigh)) {
1981                 dst_free(&rt->dst);
1982
1983                 /* We are casting this because that is the return
1984                  * value type.  But an errno encoded pointer is the
1985                  * same regardless of the underlying pointer type,
1986                  * and that's what we are returning.  So this is OK.
1987                  */
1988                 return (struct rt6_info *) neigh;
1989         }
1990         rt->rt6i_nexthop = neigh;
1991
1992         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1993         rt->rt6i_dst.plen = 128;
1994         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1995
1996         atomic_set(&rt->dst.__refcnt, 1);
1997
1998         return rt;
1999 }
2000
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches every device */
	struct net *net;	/* namespace whose ip6_null_entry must be kept */
};
2005
2006 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2007 {
2008         const struct arg_dev_net *adn = arg;
2009         const struct net_device *dev = adn->dev;
2010
2011         if ((rt->rt6i_dev == dev || dev == NULL) &&
2012             rt != adn->net->ipv6.ip6_null_entry) {
2013                 RT6_TRACE("deleted by ifdown %p\n", rt);
2014                 return -1;
2015         }
2016         return 0;
2017 }
2018
2019 void rt6_ifdown(struct net *net, struct net_device *dev)
2020 {
2021         struct arg_dev_net adn = {
2022                 .dev = dev,
2023                 .net = net,
2024         };
2025
2026         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2027         icmp6_clean_all(fib6_ifdown, &adn);
2028 }
2029
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
2035
2036 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2037 {
2038         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2039         struct inet6_dev *idev;
2040
2041         /* In IPv6 pmtu discovery is not optional,
2042            so that RTAX_MTU lock cannot disable it.
2043            We still use this lock to block changes
2044            caused by addrconf/ndisc.
2045         */
2046
2047         idev = __in6_dev_get(arg->dev);
2048         if (idev == NULL)
2049                 return 0;
2050
2051         /* For administrative MTU increase, there is no way to discover
2052            IPv6 PMTU increase, so PMTU increase should be updated here.
2053            Since RFC 1981 doesn't include administrative MTU increase
2054            update PMTU increase is a MUST. (i.e. jumbo frame)
2055          */
2056         /*
2057            If new MTU is less than route PMTU, this new MTU will be the
2058            lowest MTU in the path, update the route PMTU to reflect PMTU
2059            decreases; if new MTU is greater than route PMTU, and the
2060            old MTU is the lowest MTU in the path, update the route PMTU
2061            to reflect the increase. In this case if the other nodes' MTU
2062            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2063            PMTU discouvery.
2064          */
2065         if (rt->rt6i_dev == arg->dev &&
2066             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2067             (dst_mtu(&rt->dst) >= arg->mtu ||
2068              (dst_mtu(&rt->dst) < arg->mtu &&
2069               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2070                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2071         }
2072         return 0;
2073 }
2074
2075 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2076 {
2077         struct rt6_mtu_change_arg arg = {
2078                 .dev = dev,
2079                 .mtu = mtu,
2080         };
2081
2082         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2083 }
2084
2085 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2086         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2087         [RTA_OIF]               = { .type = NLA_U32 },
2088         [RTA_IIF]               = { .type = NLA_U32 },
2089         [RTA_PRIORITY]          = { .type = NLA_U32 },
2090         [RTA_METRICS]           = { .type = NLA_NESTED },
2091 };
2092
2093 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2094                               struct fib6_config *cfg)
2095 {
2096         struct rtmsg *rtm;
2097         struct nlattr *tb[RTA_MAX+1];
2098         int err;
2099
2100         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2101         if (err < 0)
2102                 goto errout;
2103
2104         err = -EINVAL;
2105         rtm = nlmsg_data(nlh);
2106         memset(cfg, 0, sizeof(*cfg));
2107
2108         cfg->fc_table = rtm->rtm_table;
2109         cfg->fc_dst_len = rtm->rtm_dst_len;
2110         cfg->fc_src_len = rtm->rtm_src_len;
2111         cfg->fc_flags = RTF_UP;
2112         cfg->fc_protocol = rtm->rtm_protocol;
2113
2114         if (rtm->rtm_type == RTN_UNREACHABLE)
2115                 cfg->fc_flags |= RTF_REJECT;
2116
2117         if (rtm->rtm_type == RTN_LOCAL)
2118                 cfg->fc_flags |= RTF_LOCAL;
2119
2120         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2121         cfg->fc_nlinfo.nlh = nlh;
2122         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2123
2124         if (tb[RTA_GATEWAY]) {
2125                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2126                 cfg->fc_flags |= RTF_GATEWAY;
2127         }
2128
2129         if (tb[RTA_DST]) {
2130                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2131
2132                 if (nla_len(tb[RTA_DST]) < plen)
2133                         goto errout;
2134
2135                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2136         }
2137
2138         if (tb[RTA_SRC]) {
2139                 int plen = (rtm->rtm_src_len + 7) >> 3;
2140
2141                 if (nla_len(tb[RTA_SRC]) < plen)
2142                         goto errout;
2143
2144                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2145         }
2146
2147         if (tb[RTA_OIF])
2148                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2149
2150         if (tb[RTA_PRIORITY])
2151                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2152
2153         if (tb[RTA_METRICS]) {
2154                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2155                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2156         }
2157
2158         if (tb[RTA_TABLE])
2159                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2160
2161         err = 0;
2162 errout:
2163         return err;
2164 }
2165
2166 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2167 {
2168         struct fib6_config cfg;
2169         int err;
2170
2171         err = rtm_to_fib6_config(skb, nlh, &cfg);
2172         if (err < 0)
2173                 return err;
2174
2175         return ip6_route_del(&cfg);
2176 }
2177
2178 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2179 {
2180         struct fib6_config cfg;
2181         int err;
2182
2183         err = rtm_to_fib6_config(skb, nlh, &cfg);
2184         if (err < 0)
2185                 return err;
2186
2187         return ip6_route_add(&cfg);
2188 }
2189
2190 static inline size_t rt6_nlmsg_size(void)
2191 {
2192         return NLMSG_ALIGN(sizeof(struct rtmsg))
2193                + nla_total_size(16) /* RTA_SRC */
2194                + nla_total_size(16) /* RTA_DST */
2195                + nla_total_size(16) /* RTA_GATEWAY */
2196                + nla_total_size(16) /* RTA_PREFSRC */
2197                + nla_total_size(4) /* RTA_TABLE */
2198                + nla_total_size(4) /* RTA_IIF */
2199                + nla_total_size(4) /* RTA_OIF */
2200                + nla_total_size(4) /* RTA_PRIORITY */
2201                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2202                + nla_total_size(sizeof(struct rta_cacheinfo));
2203 }
2204
2205 static int rt6_fill_node(struct net *net,
2206                          struct sk_buff *skb, struct rt6_info *rt,
2207                          struct in6_addr *dst, struct in6_addr *src,
2208                          int iif, int type, u32 pid, u32 seq,
2209                          int prefix, int nowait, unsigned int flags)
2210 {
2211         struct rtmsg *rtm;
2212         struct nlmsghdr *nlh;
2213         long expires;
2214         u32 table;
2215
2216         if (prefix) {   /* user wants prefix routes only */
2217                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2218                         /* success since this is not a prefix route */
2219                         return 1;
2220                 }
2221         }
2222
2223         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2224         if (nlh == NULL)
2225                 return -EMSGSIZE;
2226
2227         rtm = nlmsg_data(nlh);
2228         rtm->rtm_family = AF_INET6;
2229         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2230         rtm->rtm_src_len = rt->rt6i_src.plen;
2231         rtm->rtm_tos = 0;
2232         if (rt->rt6i_table)
2233                 table = rt->rt6i_table->tb6_id;
2234         else
2235                 table = RT6_TABLE_UNSPEC;
2236         rtm->rtm_table = table;
2237         NLA_PUT_U32(skb, RTA_TABLE, table);
2238         if (rt->rt6i_flags&RTF_REJECT)
2239                 rtm->rtm_type = RTN_UNREACHABLE;
2240         else if (rt->rt6i_flags&RTF_LOCAL)
2241                 rtm->rtm_type = RTN_LOCAL;
2242         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2243                 rtm->rtm_type = RTN_LOCAL;
2244         else
2245                 rtm->rtm_type = RTN_UNICAST;
2246         rtm->rtm_flags = 0;
2247         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2248         rtm->rtm_protocol = rt->rt6i_protocol;
2249         if (rt->rt6i_flags&RTF_DYNAMIC)
2250                 rtm->rtm_protocol = RTPROT_REDIRECT;
2251         else if (rt->rt6i_flags & RTF_ADDRCONF)
2252                 rtm->rtm_protocol = RTPROT_KERNEL;
2253         else if (rt->rt6i_flags&RTF_DEFAULT)
2254                 rtm->rtm_protocol = RTPROT_RA;
2255
2256         if (rt->rt6i_flags&RTF_CACHE)
2257                 rtm->rtm_flags |= RTM_F_CLONED;
2258
2259         if (dst) {
2260                 NLA_PUT(skb, RTA_DST, 16, dst);
2261                 rtm->rtm_dst_len = 128;
2262         } else if (rtm->rtm_dst_len)
2263                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2264 #ifdef CONFIG_IPV6_SUBTREES
2265         if (src) {
2266                 NLA_PUT(skb, RTA_SRC, 16, src);
2267                 rtm->rtm_src_len = 128;
2268         } else if (rtm->rtm_src_len)
2269                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2270 #endif
2271         if (iif) {
2272 #ifdef CONFIG_IPV6_MROUTE
2273                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2274                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2275                         if (err <= 0) {
2276                                 if (!nowait) {
2277                                         if (err == 0)
2278                                                 return 0;
2279                                         goto nla_put_failure;
2280                                 } else {
2281                                         if (err == -EMSGSIZE)
2282                                                 goto nla_put_failure;
2283                                 }
2284                         }
2285                 } else
2286 #endif
2287                         NLA_PUT_U32(skb, RTA_IIF, iif);
2288         } else if (dst) {
2289                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2290                 struct in6_addr saddr_buf;
2291                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2292                                        dst, 0, &saddr_buf) == 0)
2293                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2294         }
2295
2296         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2297                 goto nla_put_failure;
2298
2299         if (rt->dst.neighbour)
2300                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2301
2302         if (rt->dst.dev)
2303                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2304
2305         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2306
2307         if (!(rt->rt6i_flags & RTF_EXPIRES))
2308                 expires = 0;
2309         else if (rt->rt6i_expires - jiffies < INT_MAX)
2310                 expires = rt->rt6i_expires - jiffies;
2311         else
2312                 expires = INT_MAX;
2313
2314         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2315                                expires, rt->dst.error) < 0)
2316                 goto nla_put_failure;
2317
2318         return nlmsg_end(skb, nlh);
2319
2320 nla_put_failure:
2321         nlmsg_cancel(skb, nlh);
2322         return -EMSGSIZE;
2323 }
2324
2325 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2326 {
2327         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2328         int prefix;
2329
2330         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2331                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2332                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2333         } else
2334                 prefix = 0;
2335
2336         return rt6_fill_node(arg->net,
2337                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2338                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2339                      prefix, 0, NLM_F_MULTI);
2340 }
2341
2342 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2343 {
2344         struct net *net = sock_net(in_skb->sk);
2345         struct nlattr *tb[RTA_MAX+1];
2346         struct rt6_info *rt;
2347         struct sk_buff *skb;
2348         struct rtmsg *rtm;
2349         struct flowi fl;
2350         int err, iif = 0;
2351
2352         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2353         if (err < 0)
2354                 goto errout;
2355
2356         err = -EINVAL;
2357         memset(&fl, 0, sizeof(fl));
2358
2359         if (tb[RTA_SRC]) {
2360                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2361                         goto errout;
2362
2363                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2364         }
2365
2366         if (tb[RTA_DST]) {
2367                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2368                         goto errout;
2369
2370                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2371         }
2372
2373         if (tb[RTA_IIF])
2374                 iif = nla_get_u32(tb[RTA_IIF]);
2375
2376         if (tb[RTA_OIF])
2377                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2378
2379         if (iif) {
2380                 struct net_device *dev;
2381                 dev = __dev_get_by_index(net, iif);
2382                 if (!dev) {
2383                         err = -ENODEV;
2384                         goto errout;
2385                 }
2386         }
2387
2388         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2389         if (skb == NULL) {
2390                 err = -ENOBUFS;
2391                 goto errout;
2392         }
2393
2394         /* Reserve room for dummy headers, this skb can pass
2395            through good chunk of routing engine.
2396          */
2397         skb_reset_mac_header(skb);
2398         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2399
2400         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2401         skb_dst_set(skb, &rt->dst);
2402
2403         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2404                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2405                             nlh->nlmsg_seq, 0, 0, 0);
2406         if (err < 0) {
2407                 kfree_skb(skb);
2408                 goto errout;
2409         }
2410
2411         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2412 errout:
2413         return err;
2414 }
2415
2416 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2417 {
2418         struct sk_buff *skb;
2419         struct net *net = info->nl_net;
2420         u32 seq;
2421         int err;
2422
2423         err = -ENOBUFS;
2424         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2425
2426         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2427         if (skb == NULL)
2428                 goto errout;
2429
2430         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2431                                 event, info->pid, seq, 0, 0, 0);
2432         if (err < 0) {
2433                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2434                 WARN_ON(err == -EMSGSIZE);
2435                 kfree_skb(skb);
2436                 goto errout;
2437         }
2438         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2439                     info->nlh, gfp_any());
2440         return;
2441 errout:
2442         if (err < 0)
2443                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2444 }
2445
2446 static int ip6_route_dev_notify(struct notifier_block *this,
2447                                 unsigned long event, void *data)
2448 {
2449         struct net_device *dev = (struct net_device *)data;
2450         struct net *net = dev_net(dev);
2451
2452         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2453                 net->ipv6.ip6_null_entry->dst.dev = dev;
2454                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2455 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2456                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2457                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2458                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2459                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2460 #endif
2461         }
2462
2463         return NOTIFY_OK;
2464 }
2465
2466 /*
2467  *      /proc
2468  */
2469
2470 #ifdef CONFIG_PROC_FS
2471
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in the visible part of this file references this
 * struct; confirm whether it is still needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2480
2481 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2482 {
2483         struct seq_file *m = p_arg;
2484
2485         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2486
2487 #ifdef CONFIG_IPV6_SUBTREES
2488         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2489 #else
2490         seq_puts(m, "00000000000000000000000000000000 00 ");
2491 #endif
2492
2493         if (rt->rt6i_nexthop) {
2494                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2495         } else {
2496                 seq_puts(m, "00000000000000000000000000000000");
2497         }
2498         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2499                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2500                    rt->dst.__use, rt->rt6i_flags,
2501                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2502         return 0;
2503 }
2504
2505 static int ipv6_route_show(struct seq_file *m, void *v)
2506 {
2507         struct net *net = (struct net *)m->private;
2508         fib6_clean_all(net, rt6_info_route, 0, m);
2509         return 0;
2510 }
2511
2512 static int ipv6_route_open(struct inode *inode, struct file *file)
2513 {
2514         return single_open_net(inode, file, ipv6_route_show);
2515 }
2516
2517 static const struct file_operations ipv6_route_proc_fops = {
2518         .owner          = THIS_MODULE,
2519         .open           = ipv6_route_open,
2520         .read           = seq_read,
2521         .llseek         = seq_lseek,
2522         .release        = single_release_net,
2523 };
2524
2525 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2526 {
2527         struct net *net = (struct net *)seq->private;
2528         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2529                    net->ipv6.rt6_stats->fib_nodes,
2530                    net->ipv6.rt6_stats->fib_route_nodes,
2531                    net->ipv6.rt6_stats->fib_rt_alloc,
2532                    net->ipv6.rt6_stats->fib_rt_entries,
2533                    net->ipv6.rt6_stats->fib_rt_cache,
2534                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2535                    net->ipv6.rt6_stats->fib_discarded_routes);
2536
2537         return 0;
2538 }
2539
2540 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2541 {
2542         return single_open_net(inode, file, rt6_stats_seq_show);
2543 }
2544
2545 static const struct file_operations rt6_stats_seq_fops = {
2546         .owner   = THIS_MODULE,
2547         .open    = rt6_stats_seq_open,
2548         .read    = seq_read,
2549         .llseek  = seq_lseek,
2550         .release = single_release_net,
2551 };
2552 #endif  /* CONFIG_PROC_FS */
2553
2554 #ifdef CONFIG_SYSCTL
2555
2556 static
2557 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2558                               void __user *buffer, size_t *lenp, loff_t *ppos)
2559 {
2560         struct net *net;
2561         int delay;
2562         if (!write)
2563                 return -EINVAL;
2564
2565         net = (struct net *)ctl->extra1;
2566         delay = net->ipv6.sysctl.flush_delay;
2567         proc_dointvec(ctl, write, buffer, lenp, ppos);
2568         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2569         return 0;
2570 }
2571
2572 ctl_table ipv6_route_table_template[] = {
2573         {
2574                 .procname       =       "flush",
2575                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2576                 .maxlen         =       sizeof(int),
2577                 .mode           =       0200,
2578                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2579         },
2580         {
2581                 .procname       =       "gc_thresh",
2582                 .data           =       &ip6_dst_ops_template.gc_thresh,
2583                 .maxlen         =       sizeof(int),
2584                 .mode           =       0644,
2585                 .proc_handler   =       proc_dointvec,
2586         },
2587         {
2588                 .procname       =       "max_size",
2589                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2590                 .maxlen         =       sizeof(int),
2591                 .mode           =       0644,
2592                 .proc_handler   =       proc_dointvec,
2593         },
2594         {
2595                 .procname       =       "gc_min_interval",
2596                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2597                 .maxlen         =       sizeof(int),
2598                 .mode           =       0644,
2599                 .proc_handler   =       proc_dointvec_jiffies,
2600         },
2601         {
2602                 .procname       =       "gc_timeout",
2603                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2604                 .maxlen         =       sizeof(int),
2605                 .mode           =       0644,
2606                 .proc_handler   =       proc_dointvec_jiffies,
2607         },
2608         {
2609                 .procname       =       "gc_interval",
2610                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2611                 .maxlen         =       sizeof(int),
2612                 .mode           =       0644,
2613                 .proc_handler   =       proc_dointvec_jiffies,
2614         },
2615         {
2616                 .procname       =       "gc_elasticity",
2617                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2618                 .maxlen         =       sizeof(int),
2619                 .mode           =       0644,
2620                 .proc_handler   =       proc_dointvec,
2621         },
2622         {
2623                 .procname       =       "mtu_expires",
2624                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2625                 .maxlen         =       sizeof(int),
2626                 .mode           =       0644,
2627                 .proc_handler   =       proc_dointvec_jiffies,
2628         },
2629         {
2630                 .procname       =       "min_adv_mss",
2631                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2632                 .maxlen         =       sizeof(int),
2633                 .mode           =       0644,
2634                 .proc_handler   =       proc_dointvec,
2635         },
2636         {
2637                 .procname       =       "gc_min_interval_ms",
2638                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2639                 .maxlen         =       sizeof(int),
2640                 .mode           =       0644,
2641                 .proc_handler   =       proc_dointvec_ms_jiffies,
2642         },
2643         { }
2644 };
2645
2646 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2647 {
2648         struct ctl_table *table;
2649
2650         table = kmemdup(ipv6_route_table_template,
2651                         sizeof(ipv6_route_table_template),
2652                         GFP_KERNEL);
2653
2654         if (table) {
2655                 table[0].data = &net->ipv6.sysctl.flush_delay;
2656                 table[0].extra1 = net;
2657                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2658                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2659                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2660                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2661                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2662                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2663                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2664                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2665                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2666         }
2667
2668         return table;
2669 }
2670 #endif
2671
2672 static int __net_init ip6_route_net_init(struct net *net)
2673 {
2674         int ret = -ENOMEM;
2675
2676         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2677                sizeof(net->ipv6.ip6_dst_ops));
2678
2679         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2680                 goto out_ip6_dst_ops;
2681
2682         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2683                                            sizeof(*net->ipv6.ip6_null_entry),
2684                                            GFP_KERNEL);
2685         if (!net->ipv6.ip6_null_entry)
2686                 goto out_ip6_dst_entries;
2687         net->ipv6.ip6_null_entry->dst.path =
2688                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2689         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2690         dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2691
2692 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2693         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2694                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2695                                                GFP_KERNEL);
2696         if (!net->ipv6.ip6_prohibit_entry)
2697                 goto out_ip6_null_entry;
2698         net->ipv6.ip6_prohibit_entry->dst.path =
2699                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2700         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2701         dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2702
2703         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2704                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2705                                                GFP_KERNEL);
2706         if (!net->ipv6.ip6_blk_hole_entry)
2707                 goto out_ip6_prohibit_entry;
2708         net->ipv6.ip6_blk_hole_entry->dst.path =
2709                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2710         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2711         dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2712 #endif
2713
2714         net->ipv6.sysctl.flush_delay = 0;
2715         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2716         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2717         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2718         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2719         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2720         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2721         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2722
2723 #ifdef CONFIG_PROC_FS
2724         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2725         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2726 #endif
2727         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2728
2729         ret = 0;
2730 out:
2731         return ret;
2732
2733 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2734 out_ip6_prohibit_entry:
2735         kfree(net->ipv6.ip6_prohibit_entry);
2736 out_ip6_null_entry:
2737         kfree(net->ipv6.ip6_null_entry);
2738 #endif
2739 out_ip6_dst_entries:
2740         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2741 out_ip6_dst_ops:
2742         goto out;
2743 }
2744
2745 static void __net_exit ip6_route_net_exit(struct net *net)
2746 {
2747 #ifdef CONFIG_PROC_FS
2748         proc_net_remove(net, "ipv6_route");
2749         proc_net_remove(net, "rt6_stats");
2750 #endif
2751         kfree(net->ipv6.ip6_null_entry);
2752 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2753         kfree(net->ipv6.ip6_prohibit_entry);
2754         kfree(net->ipv6.ip6_blk_hole_entry);
2755 #endif
2756         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2757 }
2758
2759 static struct pernet_operations ip6_route_net_ops = {
2760         .init = ip6_route_net_init,
2761         .exit = ip6_route_net_exit,
2762 };
2763
2764 static struct notifier_block ip6_route_dev_notifier = {
2765         .notifier_call = ip6_route_dev_notify,
2766         .priority = 0,
2767 };
2768
2769 int __init ip6_route_init(void)
2770 {
2771         int ret;
2772
2773         ret = -ENOMEM;
2774         ip6_dst_ops_template.kmem_cachep =
2775                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2776                                   SLAB_HWCACHE_ALIGN, NULL);
2777         if (!ip6_dst_ops_template.kmem_cachep)
2778                 goto out;
2779
2780         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2781         if (ret)
2782                 goto out_kmem_cache;
2783
2784         ret = register_pernet_subsys(&ip6_route_net_ops);
2785         if (ret)
2786                 goto out_dst_entries;
2787
2788         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2789
2790         /* Registering of the loopback is done before this portion of code,
2791          * the loopback reference in rt6_info will not be taken, do it
2792          * manually for init_net */
2793         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2794         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2795   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2796         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2797         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2798         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2799         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2800   #endif
2801         ret = fib6_init();
2802         if (ret)
2803                 goto out_register_subsys;
2804
2805         ret = xfrm6_init();
2806         if (ret)
2807                 goto out_fib6_init;
2808
2809         ret = fib6_rules_init();
2810         if (ret)
2811                 goto xfrm6_init;
2812
2813         ret = -ENOBUFS;
2814         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2815             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2816             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2817                 goto fib6_rules_init;
2818
2819         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2820         if (ret)
2821                 goto fib6_rules_init;
2822
2823 out:
2824         return ret;
2825
2826 fib6_rules_init:
2827         fib6_rules_cleanup();
2828 xfrm6_init:
2829         xfrm6_fini();
2830 out_fib6_init:
2831         fib6_gc_cleanup();
2832 out_register_subsys:
2833         unregister_pernet_subsys(&ip6_route_net_ops);
2834 out_dst_entries:
2835         dst_entries_destroy(&ip6_dst_blackhole_ops);
2836 out_kmem_cache:
2837         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2838         goto out;
2839 }
2840
2841 void ip6_route_cleanup(void)
2842 {
2843         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2844         fib6_rules_cleanup();
2845         xfrm6_fini();
2846         fib6_gc_cleanup();
2847         unregister_pernet_subsys(&ip6_route_net_ops);
2848         dst_entries_destroy(&ip6_dst_blackhole_ops);
2849         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2850 }