]> git.karo-electronics.de Git - mv-sheeva.git/blob - net/ipv6/route.c
net: Abstract RTAX_HOPLIMIT metric accesses behind helper.
[mv-sheeva.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() expand to printk() calls; below that both expand to
 * nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
75 #define CLONE_OFFLINK_ROUTE 0
76
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops_template = {
101         .family                 =       AF_INET6,
102         .protocol               =       cpu_to_be16(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .destroy                =       ip6_dst_destroy,
107         .ifdown                 =       ip6_dst_ifdown,
108         .negative_advice        =       ip6_negative_advice,
109         .link_failure           =       ip6_link_failure,
110         .update_pmtu            =       ip6_rt_update_pmtu,
111         .local_out              =       __ip6_local_out,
112 };
113
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117
118 static struct dst_ops ip6_dst_blackhole_ops = {
119         .family                 =       AF_INET6,
120         .protocol               =       cpu_to_be16(ETH_P_IPV6),
121         .destroy                =       ip6_dst_destroy,
122         .check                  =       ip6_dst_check,
123         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
124 };
125
126 static struct rt6_info ip6_null_entry_template = {
127         .dst = {
128                 .__refcnt       = ATOMIC_INIT(1),
129                 .__use          = 1,
130                 .obsolete       = -1,
131                 .error          = -ENETUNREACH,
132                 .input          = ip6_pkt_discard,
133                 .output         = ip6_pkt_discard_out,
134         },
135         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
136         .rt6i_protocol  = RTPROT_KERNEL,
137         .rt6i_metric    = ~(u32) 0,
138         .rt6i_ref       = ATOMIC_INIT(1),
139 };
140
141 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
142
143 static int ip6_pkt_prohibit(struct sk_buff *skb);
144 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
145
146 static struct rt6_info ip6_prohibit_entry_template = {
147         .dst = {
148                 .__refcnt       = ATOMIC_INIT(1),
149                 .__use          = 1,
150                 .obsolete       = -1,
151                 .error          = -EACCES,
152                 .input          = ip6_pkt_prohibit,
153                 .output         = ip6_pkt_prohibit_out,
154         },
155         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
156         .rt6i_protocol  = RTPROT_KERNEL,
157         .rt6i_metric    = ~(u32) 0,
158         .rt6i_ref       = ATOMIC_INIT(1),
159 };
160
161 static struct rt6_info ip6_blk_hole_entry_template = {
162         .dst = {
163                 .__refcnt       = ATOMIC_INIT(1),
164                 .__use          = 1,
165                 .obsolete       = -1,
166                 .error          = -EINVAL,
167                 .input          = dst_discard,
168                 .output         = dst_discard,
169         },
170         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
171         .rt6i_protocol  = RTPROT_KERNEL,
172         .rt6i_metric    = ~(u32) 0,
173         .rt6i_ref       = ATOMIC_INIT(1),
174 };
175
176 #endif
177
/*
 * Allocate a dst entry from @ops and hand it back viewed as an rt6_info.
 * Returns whatever dst_alloc() returned, merely cast.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	void *entry = dst_alloc(ops);

	return (struct rt6_info *)entry;
}
183
184 static void ip6_dst_destroy(struct dst_entry *dst)
185 {
186         struct rt6_info *rt = (struct rt6_info *)dst;
187         struct inet6_dev *idev = rt->rt6i_idev;
188         struct inet_peer *peer = rt->rt6i_peer;
189
190         if (idev != NULL) {
191                 rt->rt6i_idev = NULL;
192                 in6_dev_put(idev);
193         }
194         if (peer) {
195                 BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
196                 rt->rt6i_peer = NULL;
197                 inet_putpeer(peer);
198         }
199 }
200
201 void rt6_bind_peer(struct rt6_info *rt, int create)
202 {
203         struct inet_peer *peer;
204
205         if (WARN_ON(!(rt->rt6i_flags & RTF_CACHE)))
206                 return;
207
208         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
209         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
210                 inet_putpeer(peer);
211 }
212
213 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
214                            int how)
215 {
216         struct rt6_info *rt = (struct rt6_info *)dst;
217         struct inet6_dev *idev = rt->rt6i_idev;
218         struct net_device *loopback_dev =
219                 dev_net(dev)->loopback_dev;
220
221         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
222                 struct inet6_dev *loopback_idev =
223                         in6_dev_get(loopback_dev);
224                 if (loopback_idev != NULL) {
225                         rt->rt6i_idev = loopback_idev;
226                         in6_dev_put(idev);
227                 }
228         }
229 }
230
231 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
232 {
233         return (rt->rt6i_flags & RTF_EXPIRES) &&
234                 time_after(jiffies, rt->rt6i_expires);
235 }
236
237 static inline int rt6_need_strict(struct in6_addr *daddr)
238 {
239         return ipv6_addr_type(daddr) &
240                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
241 }
242
243 /*
244  *      Route lookup. Any table->tb6_lock is implied.
245  */
246
247 static inline struct rt6_info *rt6_device_match(struct net *net,
248                                                     struct rt6_info *rt,
249                                                     struct in6_addr *saddr,
250                                                     int oif,
251                                                     int flags)
252 {
253         struct rt6_info *local = NULL;
254         struct rt6_info *sprt;
255
256         if (!oif && ipv6_addr_any(saddr))
257                 goto out;
258
259         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
260                 struct net_device *dev = sprt->rt6i_dev;
261
262                 if (oif) {
263                         if (dev->ifindex == oif)
264                                 return sprt;
265                         if (dev->flags & IFF_LOOPBACK) {
266                                 if (sprt->rt6i_idev == NULL ||
267                                     sprt->rt6i_idev->dev->ifindex != oif) {
268                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
269                                                 continue;
270                                         if (local && (!oif ||
271                                                       local->rt6i_idev->dev->ifindex == oif))
272                                                 continue;
273                                 }
274                                 local = sprt;
275                         }
276                 } else {
277                         if (ipv6_chk_addr(net, saddr, dev,
278                                           flags & RT6_LOOKUP_F_IFACE))
279                                 return sprt;
280                 }
281         }
282
283         if (oif) {
284                 if (local)
285                         return local;
286
287                 if (flags & RT6_LOOKUP_F_IFACE)
288                         return net->ipv6.ip6_null_entry;
289         }
290 out:
291         return rt;
292 }
293
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the route's next-hop neighbour is not in a NUD_VALID state and the
 * last update is older than the interface's rtr_probe_interval, send a
 * Neighbour Solicitation to the solicited-node multicast address of the
 * neighbour.  Probes MUST be rate-limited to no more than one per
 * minute; the rtr_probe_interval comparison below provides the limit.
 *
 * A NULL @rt, or a route without a next hop, is silently ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/* Cheap unlocked pre-check. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		read_unlock_bh(&neigh->lock);
		return;
	}
	/* Stamp the neighbour so the next probe waits a full interval. */
	neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no reachability probes are sent. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
328
329 /*
330  * Default Router Selection (RFC 2461 6.3.6)
331  */
332 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
333 {
334         struct net_device *dev = rt->rt6i_dev;
335         if (!oif || dev->ifindex == oif)
336                 return 2;
337         if ((dev->flags & IFF_LOOPBACK) &&
338             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
339                 return 1;
340         return 0;
341 }
342
343 static inline int rt6_check_neigh(struct rt6_info *rt)
344 {
345         struct neighbour *neigh = rt->rt6i_nexthop;
346         int m;
347         if (rt->rt6i_flags & RTF_NONEXTHOP ||
348             !(rt->rt6i_flags & RTF_GATEWAY))
349                 m = 1;
350         else if (neigh) {
351                 read_lock_bh(&neigh->lock);
352                 if (neigh->nud_state & NUD_VALID)
353                         m = 2;
354 #ifdef CONFIG_IPV6_ROUTER_PREF
355                 else if (neigh->nud_state & NUD_FAILED)
356                         m = 0;
357 #endif
358                 else
359                         m = 1;
360                 read_unlock_bh(&neigh->lock);
361         } else
362                 m = 0;
363         return m;
364 }
365
366 static int rt6_score_route(struct rt6_info *rt, int oif,
367                            int strict)
368 {
369         int m, n;
370
371         m = rt6_check_dev(rt, oif);
372         if (!m && (strict & RT6_LOOKUP_F_IFACE))
373                 return -1;
374 #ifdef CONFIG_IPV6_ROUTER_PREF
375         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
376 #endif
377         n = rt6_check_neigh(rt);
378         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
379                 return -1;
380         return m;
381 }
382
383 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
384                                    int *mpri, struct rt6_info *match)
385 {
386         int m;
387
388         if (rt6_check_expired(rt))
389                 goto out;
390
391         m = rt6_score_route(rt, oif, strict);
392         if (m < 0)
393                 goto out;
394
395         if (m > *mpri) {
396                 if (strict & RT6_LOOKUP_F_REACHABLE)
397                         rt6_probe(match);
398                 *mpri = m;
399                 match = rt;
400         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
401                 rt6_probe(rt);
402         }
403
404 out:
405         return match;
406 }
407
408 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
409                                      struct rt6_info *rr_head,
410                                      u32 metric, int oif, int strict)
411 {
412         struct rt6_info *rt, *match;
413         int mpri = -1;
414
415         match = NULL;
416         for (rt = rr_head; rt && rt->rt6i_metric == metric;
417              rt = rt->dst.rt6_next)
418                 match = find_match(rt, oif, strict, &mpri, match);
419         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
420              rt = rt->dst.rt6_next)
421                 match = find_match(rt, oif, strict, &mpri, match);
422
423         return match;
424 }
425
426 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
427 {
428         struct rt6_info *match, *rt0;
429         struct net *net;
430
431         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
432                   __func__, fn->leaf, oif);
433
434         rt0 = fn->rr_ptr;
435         if (!rt0)
436                 fn->rr_ptr = rt0 = fn->leaf;
437
438         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
439
440         if (!match &&
441             (strict & RT6_LOOKUP_F_REACHABLE)) {
442                 struct rt6_info *next = rt0->dst.rt6_next;
443
444                 /* no entries matched; do round-robin */
445                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
446                         next = fn->leaf;
447
448                 if (next != rt0)
449                         fn->rr_ptr = next;
450         }
451
452         RT6_TRACE("%s() => %p\n",
453                   __func__, match);
454
455         net = dev_net(rt0->rt6i_dev);
456         return match ? match : net->ipv6.ip6_null_entry;
457 }
458
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option received on @dev.
 *
 * @opt/@len:	the raw option and its length
 * @gwaddr:	address of the advertising router
 *
 * Depending on the advertised lifetime, the matching route is deleted,
 * created, or has its preference and expiry refreshed.
 *
 * Returns 0 on success and -EINVAL for a malformed option or an invalid
 * preference value.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* Option carries all 128 prefix bits; use it in place. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
532
/*
 * Backtrack through the fib6 tree when the current lookup ended at the
 * namespace's null entry.
 *
 * The macro relies on the caller's scope: it reads the local `rt`,
 * reads and rewrites the local `fn`, and jumps to the caller's labels
 * `out` (tree root reached) and `restart` (a node flagged RTN_RTINFO
 * was found).  When the parent has a subtree other than the current
 * node, that subtree is searched with @saddr; otherwise the walk moves
 * to the parent.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn;				\
			else						\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while(0)
550
551 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
552                                              struct fib6_table *table,
553                                              struct flowi *fl, int flags)
554 {
555         struct fib6_node *fn;
556         struct rt6_info *rt;
557
558         read_lock_bh(&table->tb6_lock);
559         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
560 restart:
561         rt = fn->leaf;
562         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
563         BACKTRACK(net, &fl->fl6_src);
564 out:
565         dst_use(&rt->dst, jiffies);
566         read_unlock_bh(&table->tb6_lock);
567         return rt;
568
569 }
570
571 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
572                             const struct in6_addr *saddr, int oif, int strict)
573 {
574         struct flowi fl = {
575                 .oif = oif,
576                 .fl6_dst = *daddr,
577         };
578         struct dst_entry *dst;
579         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
580
581         if (saddr) {
582                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
583                 flags |= RT6_LOOKUP_F_HAS_SADDR;
584         }
585
586         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
587         if (dst->error == 0)
588                 return (struct rt6_info *) dst;
589
590         dst_release(dst);
591
592         return NULL;
593 }
594
595 EXPORT_SYMBOL(rt6_lookup);
596
597 /* ip6_ins_rt is called with table->tb6_lock NOT held.
598    It takes a new route entry; if the addition fails for any reason,
599    the route is freed. In any case, if the caller does not hold a
600    reference to the route, it may be destroyed.
601  */
602
603 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
604 {
605         int err;
606         struct fib6_table *table;
607
608         table = rt->rt6i_table;
609         write_lock_bh(&table->tb6_lock);
610         err = fib6_add(&table->tb6_root, rt, info);
611         write_unlock_bh(&table->tb6_lock);
612
613         return err;
614 }
615
616 int ip6_ins_rt(struct rt6_info *rt)
617 {
618         struct nl_info info = {
619                 .nl_net = dev_net(rt->rt6i_dev),
620         };
621         return __ip6_ins_rt(rt, &info);
622 }
623
624 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
625                                       struct in6_addr *saddr)
626 {
627         struct rt6_info *rt;
628
629         /*
630          *      Clone the route.
631          */
632
633         rt = ip6_rt_copy(ort);
634
635         if (rt) {
636                 struct neighbour *neigh;
637                 int attempts = !in_softirq();
638
639                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
640                         if (rt->rt6i_dst.plen != 128 &&
641                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
642                                 rt->rt6i_flags |= RTF_ANYCAST;
643                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
644                 }
645
646                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
647                 rt->rt6i_dst.plen = 128;
648                 rt->rt6i_flags |= RTF_CACHE;
649                 rt->dst.flags |= DST_HOST;
650
651 #ifdef CONFIG_IPV6_SUBTREES
652                 if (rt->rt6i_src.plen && saddr) {
653                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
654                         rt->rt6i_src.plen = 128;
655                 }
656 #endif
657
658         retry:
659                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
660                 if (IS_ERR(neigh)) {
661                         struct net *net = dev_net(rt->rt6i_dev);
662                         int saved_rt_min_interval =
663                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
664                         int saved_rt_elasticity =
665                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
666
667                         if (attempts-- > 0) {
668                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
669                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
670
671                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
672
673                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
674                                         saved_rt_elasticity;
675                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
676                                         saved_rt_min_interval;
677                                 goto retry;
678                         }
679
680                         if (net_ratelimit())
681                                 printk(KERN_WARNING
682                                        "ipv6: Neighbour table overflow.\n");
683                         dst_free(&rt->dst);
684                         return NULL;
685                 }
686                 rt->rt6i_nexthop = neigh;
687
688         }
689
690         return rt;
691 }
692
693 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
694 {
695         struct rt6_info *rt = ip6_rt_copy(ort);
696         if (rt) {
697                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
698                 rt->rt6i_dst.plen = 128;
699                 rt->rt6i_flags |= RTF_CACHE;
700                 rt->dst.flags |= DST_HOST;
701                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
702         }
703         return rt;
704 }
705
706 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
707                                       struct flowi *fl, int flags)
708 {
709         struct fib6_node *fn;
710         struct rt6_info *rt, *nrt;
711         int strict = 0;
712         int attempts = 3;
713         int err;
714         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
715
716         strict |= flags & RT6_LOOKUP_F_IFACE;
717
718 relookup:
719         read_lock_bh(&table->tb6_lock);
720
721 restart_2:
722         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
723
724 restart:
725         rt = rt6_select(fn, oif, strict | reachable);
726
727         BACKTRACK(net, &fl->fl6_src);
728         if (rt == net->ipv6.ip6_null_entry ||
729             rt->rt6i_flags & RTF_CACHE)
730                 goto out;
731
732         dst_hold(&rt->dst);
733         read_unlock_bh(&table->tb6_lock);
734
735         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
736                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
737         else {
738 #if CLONE_OFFLINK_ROUTE
739                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
740 #else
741                 goto out2;
742 #endif
743         }
744
745         dst_release(&rt->dst);
746         rt = nrt ? : net->ipv6.ip6_null_entry;
747
748         dst_hold(&rt->dst);
749         if (nrt) {
750                 err = ip6_ins_rt(nrt);
751                 if (!err)
752                         goto out2;
753         }
754
755         if (--attempts <= 0)
756                 goto out2;
757
758         /*
759          * Race condition! In the gap, when table->tb6_lock was
760          * released someone could insert this route.  Relookup.
761          */
762         dst_release(&rt->dst);
763         goto relookup;
764
765 out:
766         if (reachable) {
767                 reachable = 0;
768                 goto restart_2;
769         }
770         dst_hold(&rt->dst);
771         read_unlock_bh(&table->tb6_lock);
772 out2:
773         rt->dst.lastuse = jiffies;
774         rt->dst.__use++;
775
776         return rt;
777 }
778
779 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
780                                             struct flowi *fl, int flags)
781 {
782         return ip6_pol_route(net, table, fl->iif, fl, flags);
783 }
784
785 void ip6_route_input(struct sk_buff *skb)
786 {
787         struct ipv6hdr *iph = ipv6_hdr(skb);
788         struct net *net = dev_net(skb->dev);
789         int flags = RT6_LOOKUP_F_HAS_SADDR;
790         struct flowi fl = {
791                 .iif = skb->dev->ifindex,
792                 .fl6_dst = iph->daddr,
793                 .fl6_src = iph->saddr,
794                 .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
795                 .mark = skb->mark,
796                 .proto = iph->nexthdr,
797         };
798
799         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
800                 flags |= RT6_LOOKUP_F_IFACE;
801
802         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
803 }
804
805 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
806                                              struct flowi *fl, int flags)
807 {
808         return ip6_pol_route(net, table, fl->oif, fl, flags);
809 }
810
811 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
812                                     struct flowi *fl)
813 {
814         int flags = 0;
815
816         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
817                 flags |= RT6_LOOKUP_F_IFACE;
818
819         if (!ipv6_addr_any(&fl->fl6_src))
820                 flags |= RT6_LOOKUP_F_HAS_SADDR;
821         else if (sk)
822                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
823
824         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
825 }
826
827 EXPORT_SYMBOL(ip6_route_output);
828
829 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
830 {
831         struct rt6_info *ort = (struct rt6_info *) *dstp;
832         struct rt6_info *rt = (struct rt6_info *)
833                 dst_alloc(&ip6_dst_blackhole_ops);
834         struct dst_entry *new = NULL;
835
836         if (rt) {
837                 new = &rt->dst;
838
839                 atomic_set(&new->__refcnt, 1);
840                 new->__use = 1;
841                 new->input = dst_discard;
842                 new->output = dst_discard;
843
844                 dst_copy_metrics(new, &ort->dst);
845                 new->dev = ort->dst.dev;
846                 if (new->dev)
847                         dev_hold(new->dev);
848                 rt->rt6i_idev = ort->rt6i_idev;
849                 if (rt->rt6i_idev)
850                         in6_dev_hold(rt->rt6i_idev);
851                 rt->rt6i_expires = 0;
852
853                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
854                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
855                 rt->rt6i_metric = 0;
856
857                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
858 #ifdef CONFIG_IPV6_SUBTREES
859                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
860 #endif
861
862                 dst_free(new);
863         }
864
865         dst_release(*dstp);
866         *dstp = new;
867         return new ? 0 : -ENOMEM;
868 }
869 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
870
871 /*
872  *      Destination cache support functions
873  */
874
875 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
876 {
877         struct rt6_info *rt;
878
879         rt = (struct rt6_info *) dst;
880
881         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
882                 return dst;
883
884         return NULL;
885 }
886
887 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
888 {
889         struct rt6_info *rt = (struct rt6_info *) dst;
890
891         if (rt) {
892                 if (rt->rt6i_flags & RTF_CACHE) {
893                         if (rt6_check_expired(rt)) {
894                                 ip6_del_rt(rt);
895                                 dst = NULL;
896                         }
897                 } else {
898                         dst_release(dst);
899                         dst = NULL;
900                 }
901         }
902         return dst;
903 }
904
905 static void ip6_link_failure(struct sk_buff *skb)
906 {
907         struct rt6_info *rt;
908
909         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
910
911         rt = (struct rt6_info *) skb_dst(skb);
912         if (rt) {
913                 if (rt->rt6i_flags&RTF_CACHE) {
914                         dst_set_expires(&rt->dst, 0);
915                         rt->rt6i_flags |= RTF_EXPIRES;
916                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
917                         rt->rt6i_node->fn_sernum = -1;
918         }
919 }
920
921 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
922 {
923         struct rt6_info *rt6 = (struct rt6_info*)dst;
924
925         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
926                 rt6->rt6i_flags |= RTF_MODIFIED;
927                 if (mtu < IPV6_MIN_MTU) {
928                         u32 features = dst_metric(dst, RTAX_FEATURES);
929                         mtu = IPV6_MIN_MTU;
930                         features |= RTAX_FEATURE_ALLFRAG;
931                         dst_metric_set(dst, RTAX_FEATURES, features);
932                 }
933                 dst_metric_set(dst, RTAX_MTU, mtu);
934                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
935         }
936 }
937
938 static int ipv6_get_mtu(struct net_device *dev);
939
940 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
941 {
942         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
943
944         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
945                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
946
947         /*
948          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
949          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
950          * IPV6_MAXPLEN is also valid and means: "any MSS,
951          * rely only on pmtu discovery"
952          */
953         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
954                 mtu = IPV6_MAXPLEN;
955         return mtu;
956 }
957
958 static struct dst_entry *icmp6_dst_gc_list;
959 static DEFINE_SPINLOCK(icmp6_dst_lock);
960
961 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
962                                   struct neighbour *neigh,
963                                   const struct in6_addr *addr)
964 {
965         struct rt6_info *rt;
966         struct inet6_dev *idev = in6_dev_get(dev);
967         struct net *net = dev_net(dev);
968
969         if (unlikely(idev == NULL))
970                 return NULL;
971
972         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
973         if (unlikely(rt == NULL)) {
974                 in6_dev_put(idev);
975                 goto out;
976         }
977
978         dev_hold(dev);
979         if (neigh)
980                 neigh_hold(neigh);
981         else {
982                 neigh = ndisc_get_neigh(dev, addr);
983                 if (IS_ERR(neigh))
984                         neigh = NULL;
985         }
986
987         rt->rt6i_dev      = dev;
988         rt->rt6i_idev     = idev;
989         rt->rt6i_nexthop  = neigh;
990         atomic_set(&rt->dst.__refcnt, 1);
991         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
992         dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(rt->rt6i_dev));
993         dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, dst_mtu(&rt->dst)));
994         rt->dst.output  = ip6_output;
995
996 #if 0   /* there's no chance to use these for ndisc */
997         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
998                                 ? DST_HOST
999                                 : 0;
1000         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1001         rt->rt6i_dst.plen = 128;
1002 #endif
1003
1004         spin_lock_bh(&icmp6_dst_lock);
1005         rt->dst.next = icmp6_dst_gc_list;
1006         icmp6_dst_gc_list = &rt->dst;
1007         spin_unlock_bh(&icmp6_dst_lock);
1008
1009         fib6_force_start_gc(net);
1010
1011 out:
1012         return &rt->dst;
1013 }
1014
1015 int icmp6_dst_gc(void)
1016 {
1017         struct dst_entry *dst, *next, **pprev;
1018         int more = 0;
1019
1020         next = NULL;
1021
1022         spin_lock_bh(&icmp6_dst_lock);
1023         pprev = &icmp6_dst_gc_list;
1024
1025         while ((dst = *pprev) != NULL) {
1026                 if (!atomic_read(&dst->__refcnt)) {
1027                         *pprev = dst->next;
1028                         dst_free(dst);
1029                 } else {
1030                         pprev = &dst->next;
1031                         ++more;
1032                 }
1033         }
1034
1035         spin_unlock_bh(&icmp6_dst_lock);
1036
1037         return more;
1038 }
1039
1040 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1041                             void *arg)
1042 {
1043         struct dst_entry *dst, **pprev;
1044
1045         spin_lock_bh(&icmp6_dst_lock);
1046         pprev = &icmp6_dst_gc_list;
1047         while ((dst = *pprev) != NULL) {
1048                 struct rt6_info *rt = (struct rt6_info *) dst;
1049                 if (func(rt, arg)) {
1050                         *pprev = dst->next;
1051                         dst_free(dst);
1052                 } else {
1053                         pprev = &dst->next;
1054                 }
1055         }
1056         spin_unlock_bh(&icmp6_dst_lock);
1057 }
1058
1059 static int ip6_dst_gc(struct dst_ops *ops)
1060 {
1061         unsigned long now = jiffies;
1062         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1063         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1064         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1065         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1066         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1067         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1068         int entries;
1069
1070         entries = dst_entries_get_fast(ops);
1071         if (time_after(rt_last_gc + rt_min_interval, now) &&
1072             entries <= rt_max_size)
1073                 goto out;
1074
1075         net->ipv6.ip6_rt_gc_expire++;
1076         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1077         net->ipv6.ip6_rt_last_gc = now;
1078         entries = dst_entries_get_slow(ops);
1079         if (entries < ops->gc_thresh)
1080                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1081 out:
1082         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1083         return entries > rt_max_size;
1084 }
1085
1086 /* Clean host part of a prefix. Not necessary in radix tree,
1087    but results in cleaner routing tables.
1088
1089    Remove it only when all the things will work!
1090  */
1091
1092 static int ipv6_get_mtu(struct net_device *dev)
1093 {
1094         int mtu = IPV6_MIN_MTU;
1095         struct inet6_dev *idev;
1096
1097         rcu_read_lock();
1098         idev = __in6_dev_get(dev);
1099         if (idev)
1100                 mtu = idev->cnf.mtu6;
1101         rcu_read_unlock();
1102         return mtu;
1103 }
1104
1105 int ip6_dst_hoplimit(struct dst_entry *dst)
1106 {
1107         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1108         if (hoplimit < 0) {
1109                 struct net_device *dev = dst->dev;
1110                 struct inet6_dev *idev;
1111
1112                 rcu_read_lock();
1113                 idev = __in6_dev_get(dev);
1114                 if (idev)
1115                         hoplimit = idev->cnf.hop_limit;
1116                 else
1117                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1118                 rcu_read_unlock();
1119         }
1120         return hoplimit;
1121 }
1122 EXPORT_SYMBOL(ip6_dst_hoplimit);
1123
1124 /*
1125  *
1126  */
1127
1128 int ip6_route_add(struct fib6_config *cfg)
1129 {
1130         int err;
1131         struct net *net = cfg->fc_nlinfo.nl_net;
1132         struct rt6_info *rt = NULL;
1133         struct net_device *dev = NULL;
1134         struct inet6_dev *idev = NULL;
1135         struct fib6_table *table;
1136         int addr_type;
1137
1138         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1139                 return -EINVAL;
1140 #ifndef CONFIG_IPV6_SUBTREES
1141         if (cfg->fc_src_len)
1142                 return -EINVAL;
1143 #endif
1144         if (cfg->fc_ifindex) {
1145                 err = -ENODEV;
1146                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1147                 if (!dev)
1148                         goto out;
1149                 idev = in6_dev_get(dev);
1150                 if (!idev)
1151                         goto out;
1152         }
1153
1154         if (cfg->fc_metric == 0)
1155                 cfg->fc_metric = IP6_RT_PRIO_USER;
1156
1157         table = fib6_new_table(net, cfg->fc_table);
1158         if (table == NULL) {
1159                 err = -ENOBUFS;
1160                 goto out;
1161         }
1162
1163         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1164
1165         if (rt == NULL) {
1166                 err = -ENOMEM;
1167                 goto out;
1168         }
1169
1170         rt->dst.obsolete = -1;
1171         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1172                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1173                                 0;
1174
1175         if (cfg->fc_protocol == RTPROT_UNSPEC)
1176                 cfg->fc_protocol = RTPROT_BOOT;
1177         rt->rt6i_protocol = cfg->fc_protocol;
1178
1179         addr_type = ipv6_addr_type(&cfg->fc_dst);
1180
1181         if (addr_type & IPV6_ADDR_MULTICAST)
1182                 rt->dst.input = ip6_mc_input;
1183         else if (cfg->fc_flags & RTF_LOCAL)
1184                 rt->dst.input = ip6_input;
1185         else
1186                 rt->dst.input = ip6_forward;
1187
1188         rt->dst.output = ip6_output;
1189
1190         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1191         rt->rt6i_dst.plen = cfg->fc_dst_len;
1192         if (rt->rt6i_dst.plen == 128)
1193                rt->dst.flags = DST_HOST;
1194
1195 #ifdef CONFIG_IPV6_SUBTREES
1196         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1197         rt->rt6i_src.plen = cfg->fc_src_len;
1198 #endif
1199
1200         rt->rt6i_metric = cfg->fc_metric;
1201
1202         /* We cannot add true routes via loopback here,
1203            they would result in kernel looping; promote them to reject routes
1204          */
1205         if ((cfg->fc_flags & RTF_REJECT) ||
1206             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1207                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1208                 /* hold loopback dev/idev if we haven't done so. */
1209                 if (dev != net->loopback_dev) {
1210                         if (dev) {
1211                                 dev_put(dev);
1212                                 in6_dev_put(idev);
1213                         }
1214                         dev = net->loopback_dev;
1215                         dev_hold(dev);
1216                         idev = in6_dev_get(dev);
1217                         if (!idev) {
1218                                 err = -ENODEV;
1219                                 goto out;
1220                         }
1221                 }
1222                 rt->dst.output = ip6_pkt_discard_out;
1223                 rt->dst.input = ip6_pkt_discard;
1224                 rt->dst.error = -ENETUNREACH;
1225                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1226                 goto install_route;
1227         }
1228
1229         if (cfg->fc_flags & RTF_GATEWAY) {
1230                 struct in6_addr *gw_addr;
1231                 int gwa_type;
1232
1233                 gw_addr = &cfg->fc_gateway;
1234                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1235                 gwa_type = ipv6_addr_type(gw_addr);
1236
1237                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1238                         struct rt6_info *grt;
1239
1240                         /* IPv6 strictly inhibits using not link-local
1241                            addresses as nexthop address.
1242                            Otherwise, router will not able to send redirects.
1243                            It is very good, but in some (rare!) circumstances
1244                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1245                            some exceptions. --ANK
1246                          */
1247                         err = -EINVAL;
1248                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1249                                 goto out;
1250
1251                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1252
1253                         err = -EHOSTUNREACH;
1254                         if (grt == NULL)
1255                                 goto out;
1256                         if (dev) {
1257                                 if (dev != grt->rt6i_dev) {
1258                                         dst_release(&grt->dst);
1259                                         goto out;
1260                                 }
1261                         } else {
1262                                 dev = grt->rt6i_dev;
1263                                 idev = grt->rt6i_idev;
1264                                 dev_hold(dev);
1265                                 in6_dev_hold(grt->rt6i_idev);
1266                         }
1267                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1268                                 err = 0;
1269                         dst_release(&grt->dst);
1270
1271                         if (err)
1272                                 goto out;
1273                 }
1274                 err = -EINVAL;
1275                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1276                         goto out;
1277         }
1278
1279         err = -ENODEV;
1280         if (dev == NULL)
1281                 goto out;
1282
1283         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1284                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1285                 if (IS_ERR(rt->rt6i_nexthop)) {
1286                         err = PTR_ERR(rt->rt6i_nexthop);
1287                         rt->rt6i_nexthop = NULL;
1288                         goto out;
1289                 }
1290         }
1291
1292         rt->rt6i_flags = cfg->fc_flags;
1293
1294 install_route:
1295         if (cfg->fc_mx) {
1296                 struct nlattr *nla;
1297                 int remaining;
1298
1299                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1300                         int type = nla_type(nla);
1301
1302                         if (type) {
1303                                 if (type > RTAX_MAX) {
1304                                         err = -EINVAL;
1305                                         goto out;
1306                                 }
1307
1308                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1309                         }
1310                 }
1311         }
1312
1313         if (dst_metric_raw(&rt->dst, RTAX_HOPLIMIT) == 0)
1314                 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1315         if (!dst_mtu(&rt->dst))
1316                 dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(dev));
1317         if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1318                 dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, dst_mtu(&rt->dst)));
1319         rt->dst.dev = dev;
1320         rt->rt6i_idev = idev;
1321         rt->rt6i_table = table;
1322
1323         cfg->fc_nlinfo.nl_net = dev_net(dev);
1324
1325         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1326
1327 out:
1328         if (dev)
1329                 dev_put(dev);
1330         if (idev)
1331                 in6_dev_put(idev);
1332         if (rt)
1333                 dst_free(&rt->dst);
1334         return err;
1335 }
1336
1337 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1338 {
1339         int err;
1340         struct fib6_table *table;
1341         struct net *net = dev_net(rt->rt6i_dev);
1342
1343         if (rt == net->ipv6.ip6_null_entry)
1344                 return -ENOENT;
1345
1346         table = rt->rt6i_table;
1347         write_lock_bh(&table->tb6_lock);
1348
1349         err = fib6_del(rt, info);
1350         dst_release(&rt->dst);
1351
1352         write_unlock_bh(&table->tb6_lock);
1353
1354         return err;
1355 }
1356
1357 int ip6_del_rt(struct rt6_info *rt)
1358 {
1359         struct nl_info info = {
1360                 .nl_net = dev_net(rt->rt6i_dev),
1361         };
1362         return __ip6_del_rt(rt, &info);
1363 }
1364
1365 static int ip6_route_del(struct fib6_config *cfg)
1366 {
1367         struct fib6_table *table;
1368         struct fib6_node *fn;
1369         struct rt6_info *rt;
1370         int err = -ESRCH;
1371
1372         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1373         if (table == NULL)
1374                 return err;
1375
1376         read_lock_bh(&table->tb6_lock);
1377
1378         fn = fib6_locate(&table->tb6_root,
1379                          &cfg->fc_dst, cfg->fc_dst_len,
1380                          &cfg->fc_src, cfg->fc_src_len);
1381
1382         if (fn) {
1383                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1384                         if (cfg->fc_ifindex &&
1385                             (rt->rt6i_dev == NULL ||
1386                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1387                                 continue;
1388                         if (cfg->fc_flags & RTF_GATEWAY &&
1389                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1390                                 continue;
1391                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1392                                 continue;
1393                         dst_hold(&rt->dst);
1394                         read_unlock_bh(&table->tb6_lock);
1395
1396                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1397                 }
1398         }
1399         read_unlock_bh(&table->tb6_lock);
1400
1401         return err;
1402 }
1403
1404 /*
1405  *      Handle redirects
1406  */
1407 struct ip6rd_flowi {
1408         struct flowi fl;
1409         struct in6_addr gateway;
1410 };
1411
1412 static struct rt6_info *__ip6_route_redirect(struct net *net,
1413                                              struct fib6_table *table,
1414                                              struct flowi *fl,
1415                                              int flags)
1416 {
1417         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1418         struct rt6_info *rt;
1419         struct fib6_node *fn;
1420
1421         /*
1422          * Get the "current" route for this destination and
1423          * check if the redirect has come from approriate router.
1424          *
1425          * RFC 2461 specifies that redirects should only be
1426          * accepted if they come from the nexthop to the target.
1427          * Due to the way the routes are chosen, this notion
1428          * is a bit fuzzy and one might need to check all possible
1429          * routes.
1430          */
1431
1432         read_lock_bh(&table->tb6_lock);
1433         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1434 restart:
1435         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1436                 /*
1437                  * Current route is on-link; redirect is always invalid.
1438                  *
1439                  * Seems, previous statement is not true. It could
1440                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1441                  * But then router serving it might decide, that we should
1442                  * know truth 8)8) --ANK (980726).
1443                  */
1444                 if (rt6_check_expired(rt))
1445                         continue;
1446                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1447                         continue;
1448                 if (fl->oif != rt->rt6i_dev->ifindex)
1449                         continue;
1450                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1451                         continue;
1452                 break;
1453         }
1454
1455         if (!rt)
1456                 rt = net->ipv6.ip6_null_entry;
1457         BACKTRACK(net, &fl->fl6_src);
1458 out:
1459         dst_hold(&rt->dst);
1460
1461         read_unlock_bh(&table->tb6_lock);
1462
1463         return rt;
1464 };
1465
1466 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1467                                            struct in6_addr *src,
1468                                            struct in6_addr *gateway,
1469                                            struct net_device *dev)
1470 {
1471         int flags = RT6_LOOKUP_F_HAS_SADDR;
1472         struct net *net = dev_net(dev);
1473         struct ip6rd_flowi rdfl = {
1474                 .fl = {
1475                         .oif = dev->ifindex,
1476                         .fl6_dst = *dest,
1477                         .fl6_src = *src,
1478                 },
1479         };
1480
1481         ipv6_addr_copy(&rdfl.gateway, gateway);
1482
1483         if (rt6_need_strict(dest))
1484                 flags |= RT6_LOOKUP_F_IFACE;
1485
1486         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1487                                                    flags, __ip6_route_redirect);
1488 }
1489
1490 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1491                   struct in6_addr *saddr,
1492                   struct neighbour *neigh, u8 *lladdr, int on_link)
1493 {
1494         struct rt6_info *rt, *nrt = NULL;
1495         struct netevent_redirect netevent;
1496         struct net *net = dev_net(neigh->dev);
1497
1498         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1499
1500         if (rt == net->ipv6.ip6_null_entry) {
1501                 if (net_ratelimit())
1502                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1503                                "for redirect target\n");
1504                 goto out;
1505         }
1506
1507         /*
1508          *      We have finally decided to accept it.
1509          */
1510
1511         neigh_update(neigh, lladdr, NUD_STALE,
1512                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1513                      NEIGH_UPDATE_F_OVERRIDE|
1514                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1515                                      NEIGH_UPDATE_F_ISROUTER))
1516                      );
1517
1518         /*
1519          * Redirect received -> path was valid.
1520          * Look, redirects are sent only in response to data packets,
1521          * so that this nexthop apparently is reachable. --ANK
1522          */
1523         dst_confirm(&rt->dst);
1524
1525         /* Duplicate redirect: silently ignore. */
1526         if (neigh == rt->dst.neighbour)
1527                 goto out;
1528
1529         nrt = ip6_rt_copy(rt);
1530         if (nrt == NULL)
1531                 goto out;
1532
1533         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1534         if (on_link)
1535                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1536
1537         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1538         nrt->rt6i_dst.plen = 128;
1539         nrt->dst.flags |= DST_HOST;
1540
1541         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1542         nrt->rt6i_nexthop = neigh_clone(neigh);
1543         /* Reset pmtu, it may be better */
1544         dst_metric_set(&nrt->dst, RTAX_MTU, ipv6_get_mtu(neigh->dev));
1545         dst_metric_set(&nrt->dst, RTAX_ADVMSS, ipv6_advmss(dev_net(neigh->dev),
1546                                                            dst_mtu(&nrt->dst)));
1547
1548         if (ip6_ins_rt(nrt))
1549                 goto out;
1550
1551         netevent.old = &rt->dst;
1552         netevent.new = &nrt->dst;
1553         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1554
1555         if (rt->rt6i_flags&RTF_CACHE) {
1556                 ip6_del_rt(rt);
1557                 return;
1558         }
1559
1560 out:
1561         dst_release(&rt->dst);
1562 }
1563
1564 /*
1565  *      Handle ICMP "packet too big" messages
1566  *      i.e. Path MTU discovery
1567  */
1568
1569 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1570                              struct net *net, u32 pmtu, int ifindex)
1571 {
1572         struct rt6_info *rt, *nrt;
1573         int allfrag = 0;
1574
1575         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1576         if (rt == NULL)
1577                 return;
1578
1579         if (pmtu >= dst_mtu(&rt->dst))
1580                 goto out;
1581
1582         if (pmtu < IPV6_MIN_MTU) {
1583                 /*
1584                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1585                  * MTU (1280) and a fragment header should always be included
1586                  * after a node receiving Too Big message reporting PMTU is
1587                  * less than the IPv6 Minimum Link MTU.
1588                  */
1589                 pmtu = IPV6_MIN_MTU;
1590                 allfrag = 1;
1591         }
1592
1593         /* New mtu received -> path was valid.
1594            They are sent only in response to data packets,
1595            so that this nexthop apparently is reachable. --ANK
1596          */
1597         dst_confirm(&rt->dst);
1598
1599         /* Host route. If it is static, it would be better
1600            not to override it, but add new one, so that
1601            when cache entry will expire old pmtu
1602            would return automatically.
1603          */
1604         if (rt->rt6i_flags & RTF_CACHE) {
1605                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1606                 if (allfrag) {
1607                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1608                         features |= RTAX_FEATURE_ALLFRAG;
1609                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1610                 }
1611                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1612                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1613                 goto out;
1614         }
1615
1616         /* Network route.
1617            Two cases are possible:
1618            1. It is connected route. Action: COW
1619            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1620          */
1621         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1622                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1623         else
1624                 nrt = rt6_alloc_clone(rt, daddr);
1625
1626         if (nrt) {
1627                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1628                 if (allfrag) {
1629                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1630                         features |= RTAX_FEATURE_ALLFRAG;
1631                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1632                 }
1633
1634                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1635                  * happened within 5 mins, the recommended timer is 10 mins.
1636                  * Here this route expiration time is set to ip6_rt_mtu_expires
1637                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1638                  * and detecting PMTU increase will be automatically happened.
1639                  */
1640                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1641                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1642
1643                 ip6_ins_rt(nrt);
1644         }
1645 out:
1646         dst_release(&rt->dst);
1647 }
1648
1649 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1650                         struct net_device *dev, u32 pmtu)
1651 {
1652         struct net *net = dev_net(dev);
1653
1654         /*
1655          * RFC 1981 states that a node "MUST reduce the size of the packets it
1656          * is sending along the path" that caused the Packet Too Big message.
1657          * Since it's not possible in the general case to determine which
1658          * interface was used to send the original packet, we update the MTU
1659          * on the interface that will be used to send future packets. We also
1660          * update the MTU on the interface that received the Packet Too Big in
1661          * case the original packet was forced out that interface with
1662          * SO_BINDTODEVICE or similar. This is the next best thing to the
1663          * correct behaviour, which would be to update the MTU on all
1664          * interfaces.
1665          */
1666         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1667         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1668 }
1669
1670 /*
1671  *      Misc support functions
1672  */
1673
1674 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1675 {
1676         struct net *net = dev_net(ort->rt6i_dev);
1677         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1678
1679         if (rt) {
1680                 rt->dst.input = ort->dst.input;
1681                 rt->dst.output = ort->dst.output;
1682
1683                 dst_copy_metrics(&rt->dst, &ort->dst);
1684                 rt->dst.error = ort->dst.error;
1685                 rt->dst.dev = ort->dst.dev;
1686                 if (rt->dst.dev)
1687                         dev_hold(rt->dst.dev);
1688                 rt->rt6i_idev = ort->rt6i_idev;
1689                 if (rt->rt6i_idev)
1690                         in6_dev_hold(rt->rt6i_idev);
1691                 rt->dst.lastuse = jiffies;
1692                 rt->rt6i_expires = 0;
1693
1694                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1695                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1696                 rt->rt6i_metric = 0;
1697
1698                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1699 #ifdef CONFIG_IPV6_SUBTREES
1700                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1701 #endif
1702                 rt->rt6i_table = ort->rt6i_table;
1703         }
1704         return rt;
1705 }
1706
1707 #ifdef CONFIG_IPV6_ROUTE_INFO
1708 static struct rt6_info *rt6_get_route_info(struct net *net,
1709                                            struct in6_addr *prefix, int prefixlen,
1710                                            struct in6_addr *gwaddr, int ifindex)
1711 {
1712         struct fib6_node *fn;
1713         struct rt6_info *rt = NULL;
1714         struct fib6_table *table;
1715
1716         table = fib6_get_table(net, RT6_TABLE_INFO);
1717         if (table == NULL)
1718                 return NULL;
1719
1720         write_lock_bh(&table->tb6_lock);
1721         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1722         if (!fn)
1723                 goto out;
1724
1725         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1726                 if (rt->rt6i_dev->ifindex != ifindex)
1727                         continue;
1728                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1729                         continue;
1730                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1731                         continue;
1732                 dst_hold(&rt->dst);
1733                 break;
1734         }
1735 out:
1736         write_unlock_bh(&table->tb6_lock);
1737         return rt;
1738 }
1739
1740 static struct rt6_info *rt6_add_route_info(struct net *net,
1741                                            struct in6_addr *prefix, int prefixlen,
1742                                            struct in6_addr *gwaddr, int ifindex,
1743                                            unsigned pref)
1744 {
1745         struct fib6_config cfg = {
1746                 .fc_table       = RT6_TABLE_INFO,
1747                 .fc_metric      = IP6_RT_PRIO_USER,
1748                 .fc_ifindex     = ifindex,
1749                 .fc_dst_len     = prefixlen,
1750                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1751                                   RTF_UP | RTF_PREF(pref),
1752                 .fc_nlinfo.pid = 0,
1753                 .fc_nlinfo.nlh = NULL,
1754                 .fc_nlinfo.nl_net = net,
1755         };
1756
1757         ipv6_addr_copy(&cfg.fc_dst, prefix);
1758         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1759
1760         /* We should treat it as a default route if prefix length is 0. */
1761         if (!prefixlen)
1762                 cfg.fc_flags |= RTF_DEFAULT;
1763
1764         ip6_route_add(&cfg);
1765
1766         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1767 }
1768 #endif
1769
1770 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1771 {
1772         struct rt6_info *rt;
1773         struct fib6_table *table;
1774
1775         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1776         if (table == NULL)
1777                 return NULL;
1778
1779         write_lock_bh(&table->tb6_lock);
1780         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1781                 if (dev == rt->rt6i_dev &&
1782                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1783                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1784                         break;
1785         }
1786         if (rt)
1787                 dst_hold(&rt->dst);
1788         write_unlock_bh(&table->tb6_lock);
1789         return rt;
1790 }
1791
1792 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1793                                      struct net_device *dev,
1794                                      unsigned int pref)
1795 {
1796         struct fib6_config cfg = {
1797                 .fc_table       = RT6_TABLE_DFLT,
1798                 .fc_metric      = IP6_RT_PRIO_USER,
1799                 .fc_ifindex     = dev->ifindex,
1800                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1801                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1802                 .fc_nlinfo.pid = 0,
1803                 .fc_nlinfo.nlh = NULL,
1804                 .fc_nlinfo.nl_net = dev_net(dev),
1805         };
1806
1807         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1808
1809         ip6_route_add(&cfg);
1810
1811         return rt6_get_dflt_router(gwaddr, dev);
1812 }
1813
1814 void rt6_purge_dflt_routers(struct net *net)
1815 {
1816         struct rt6_info *rt;
1817         struct fib6_table *table;
1818
1819         /* NOTE: Keep consistent with rt6_get_dflt_router */
1820         table = fib6_get_table(net, RT6_TABLE_DFLT);
1821         if (table == NULL)
1822                 return;
1823
1824 restart:
1825         read_lock_bh(&table->tb6_lock);
1826         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1827                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1828                         dst_hold(&rt->dst);
1829                         read_unlock_bh(&table->tb6_lock);
1830                         ip6_del_rt(rt);
1831                         goto restart;
1832                 }
1833         }
1834         read_unlock_bh(&table->tb6_lock);
1835 }
1836
1837 static void rtmsg_to_fib6_config(struct net *net,
1838                                  struct in6_rtmsg *rtmsg,
1839                                  struct fib6_config *cfg)
1840 {
1841         memset(cfg, 0, sizeof(*cfg));
1842
1843         cfg->fc_table = RT6_TABLE_MAIN;
1844         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1845         cfg->fc_metric = rtmsg->rtmsg_metric;
1846         cfg->fc_expires = rtmsg->rtmsg_info;
1847         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1848         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1849         cfg->fc_flags = rtmsg->rtmsg_flags;
1850
1851         cfg->fc_nlinfo.nl_net = net;
1852
1853         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1854         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1855         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1856 }
1857
1858 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1859 {
1860         struct fib6_config cfg;
1861         struct in6_rtmsg rtmsg;
1862         int err;
1863
1864         switch(cmd) {
1865         case SIOCADDRT:         /* Add a route */
1866         case SIOCDELRT:         /* Delete a route */
1867                 if (!capable(CAP_NET_ADMIN))
1868                         return -EPERM;
1869                 err = copy_from_user(&rtmsg, arg,
1870                                      sizeof(struct in6_rtmsg));
1871                 if (err)
1872                         return -EFAULT;
1873
1874                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1875
1876                 rtnl_lock();
1877                 switch (cmd) {
1878                 case SIOCADDRT:
1879                         err = ip6_route_add(&cfg);
1880                         break;
1881                 case SIOCDELRT:
1882                         err = ip6_route_del(&cfg);
1883                         break;
1884                 default:
1885                         err = -EINVAL;
1886                 }
1887                 rtnl_unlock();
1888
1889                 return err;
1890         }
1891
1892         return -EINVAL;
1893 }
1894
1895 /*
1896  *      Drop the packet on the floor
1897  */
1898
1899 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1900 {
1901         int type;
1902         struct dst_entry *dst = skb_dst(skb);
1903         switch (ipstats_mib_noroutes) {
1904         case IPSTATS_MIB_INNOROUTES:
1905                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1906                 if (type == IPV6_ADDR_ANY) {
1907                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1908                                       IPSTATS_MIB_INADDRERRORS);
1909                         break;
1910                 }
1911                 /* FALLTHROUGH */
1912         case IPSTATS_MIB_OUTNOROUTES:
1913                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1914                               ipstats_mib_noroutes);
1915                 break;
1916         }
1917         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1918         kfree_skb(skb);
1919         return 0;
1920 }
1921
1922 static int ip6_pkt_discard(struct sk_buff *skb)
1923 {
1924         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1925 }
1926
1927 static int ip6_pkt_discard_out(struct sk_buff *skb)
1928 {
1929         skb->dev = skb_dst(skb)->dev;
1930         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1931 }
1932
1933 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1934
1935 static int ip6_pkt_prohibit(struct sk_buff *skb)
1936 {
1937         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1938 }
1939
1940 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1941 {
1942         skb->dev = skb_dst(skb)->dev;
1943         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1944 }
1945
1946 #endif
1947
1948 /*
1949  *      Allocate a dst for local (unicast / anycast) address.
1950  */
1951
1952 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1953                                     const struct in6_addr *addr,
1954                                     int anycast)
1955 {
1956         struct net *net = dev_net(idev->dev);
1957         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1958         struct neighbour *neigh;
1959
1960         if (rt == NULL) {
1961                 if (net_ratelimit())
1962                         pr_warning("IPv6:  Maximum number of routes reached,"
1963                                    " consider increasing route/max_size.\n");
1964                 return ERR_PTR(-ENOMEM);
1965         }
1966
1967         dev_hold(net->loopback_dev);
1968         in6_dev_hold(idev);
1969
1970         rt->dst.flags = DST_HOST;
1971         rt->dst.input = ip6_input;
1972         rt->dst.output = ip6_output;
1973         rt->rt6i_dev = net->loopback_dev;
1974         rt->rt6i_idev = idev;
1975         dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(rt->rt6i_dev));
1976         dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, dst_mtu(&rt->dst)));
1977         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1978         rt->dst.obsolete = -1;
1979
1980         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1981         if (anycast)
1982                 rt->rt6i_flags |= RTF_ANYCAST;
1983         else
1984                 rt->rt6i_flags |= RTF_LOCAL;
1985         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1986         if (IS_ERR(neigh)) {
1987                 dst_free(&rt->dst);
1988
1989                 /* We are casting this because that is the return
1990                  * value type.  But an errno encoded pointer is the
1991                  * same regardless of the underlying pointer type,
1992                  * and that's what we are returning.  So this is OK.
1993                  */
1994                 return (struct rt6_info *) neigh;
1995         }
1996         rt->rt6i_nexthop = neigh;
1997
1998         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1999         rt->rt6i_dst.plen = 128;
2000         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2001
2002         atomic_set(&rt->dst.__refcnt, 1);
2003
2004         return rt;
2005 }
2006
/*
 * Argument bundle handed to fib6_ifdown() by rt6_ifdown().
 */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches every device */
	struct net *net;	/* namespace whose ip6_null_entry is never deleted */
};
2011
2012 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2013 {
2014         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
2015         struct net *net = ((struct arg_dev_net *)arg)->net;
2016
2017         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2018             rt != net->ipv6.ip6_null_entry) {
2019                 RT6_TRACE("deleted by ifdown %p\n", rt);
2020                 return -1;
2021         }
2022         return 0;
2023 }
2024
2025 void rt6_ifdown(struct net *net, struct net_device *dev)
2026 {
2027         struct arg_dev_net adn = {
2028                 .dev = dev,
2029                 .net = net,
2030         };
2031
2032         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2033         icmp6_clean_all(fib6_ifdown, &adn);
2034 }
2035
/*
 * Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change().
 */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU value */
};
2041
2042 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2043 {
2044         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2045         struct inet6_dev *idev;
2046         struct net *net = dev_net(arg->dev);
2047
2048         /* In IPv6 pmtu discovery is not optional,
2049            so that RTAX_MTU lock cannot disable it.
2050            We still use this lock to block changes
2051            caused by addrconf/ndisc.
2052         */
2053
2054         idev = __in6_dev_get(arg->dev);
2055         if (idev == NULL)
2056                 return 0;
2057
2058         /* For administrative MTU increase, there is no way to discover
2059            IPv6 PMTU increase, so PMTU increase should be updated here.
2060            Since RFC 1981 doesn't include administrative MTU increase
2061            update PMTU increase is a MUST. (i.e. jumbo frame)
2062          */
2063         /*
2064            If new MTU is less than route PMTU, this new MTU will be the
2065            lowest MTU in the path, update the route PMTU to reflect PMTU
2066            decreases; if new MTU is greater than route PMTU, and the
2067            old MTU is the lowest MTU in the path, update the route PMTU
2068            to reflect the increase. In this case if the other nodes' MTU
2069            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2070            PMTU discouvery.
2071          */
2072         if (rt->rt6i_dev == arg->dev &&
2073             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2074             (dst_mtu(&rt->dst) >= arg->mtu ||
2075              (dst_mtu(&rt->dst) < arg->mtu &&
2076               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2077                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2078                 dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, arg->mtu));
2079         }
2080         return 0;
2081 }
2082
2083 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2084 {
2085         struct rt6_mtu_change_arg arg = {
2086                 .dev = dev,
2087                 .mtu = mtu,
2088         };
2089
2090         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2091 }
2092
/*
 * Netlink attribute policy passed to nlmsg_parse() for IPv6 route
 * messages.  Attributes without an entry here (e.g. RTA_DST, RTA_SRC,
 * RTA_TABLE) get no policy-level validation; the parsers below check the
 * length of RTA_DST / RTA_SRC themselves.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
};
2100
2101 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2102                               struct fib6_config *cfg)
2103 {
2104         struct rtmsg *rtm;
2105         struct nlattr *tb[RTA_MAX+1];
2106         int err;
2107
2108         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2109         if (err < 0)
2110                 goto errout;
2111
2112         err = -EINVAL;
2113         rtm = nlmsg_data(nlh);
2114         memset(cfg, 0, sizeof(*cfg));
2115
2116         cfg->fc_table = rtm->rtm_table;
2117         cfg->fc_dst_len = rtm->rtm_dst_len;
2118         cfg->fc_src_len = rtm->rtm_src_len;
2119         cfg->fc_flags = RTF_UP;
2120         cfg->fc_protocol = rtm->rtm_protocol;
2121
2122         if (rtm->rtm_type == RTN_UNREACHABLE)
2123                 cfg->fc_flags |= RTF_REJECT;
2124
2125         if (rtm->rtm_type == RTN_LOCAL)
2126                 cfg->fc_flags |= RTF_LOCAL;
2127
2128         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2129         cfg->fc_nlinfo.nlh = nlh;
2130         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2131
2132         if (tb[RTA_GATEWAY]) {
2133                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2134                 cfg->fc_flags |= RTF_GATEWAY;
2135         }
2136
2137         if (tb[RTA_DST]) {
2138                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2139
2140                 if (nla_len(tb[RTA_DST]) < plen)
2141                         goto errout;
2142
2143                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2144         }
2145
2146         if (tb[RTA_SRC]) {
2147                 int plen = (rtm->rtm_src_len + 7) >> 3;
2148
2149                 if (nla_len(tb[RTA_SRC]) < plen)
2150                         goto errout;
2151
2152                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2153         }
2154
2155         if (tb[RTA_OIF])
2156                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2157
2158         if (tb[RTA_PRIORITY])
2159                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2160
2161         if (tb[RTA_METRICS]) {
2162                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2163                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2164         }
2165
2166         if (tb[RTA_TABLE])
2167                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2168
2169         err = 0;
2170 errout:
2171         return err;
2172 }
2173
2174 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2175 {
2176         struct fib6_config cfg;
2177         int err;
2178
2179         err = rtm_to_fib6_config(skb, nlh, &cfg);
2180         if (err < 0)
2181                 return err;
2182
2183         return ip6_route_del(&cfg);
2184 }
2185
2186 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2187 {
2188         struct fib6_config cfg;
2189         int err;
2190
2191         err = rtm_to_fib6_config(skb, nlh, &cfg);
2192         if (err < 0)
2193                 return err;
2194
2195         return ip6_route_add(&cfg);
2196 }
2197
2198 static inline size_t rt6_nlmsg_size(void)
2199 {
2200         return NLMSG_ALIGN(sizeof(struct rtmsg))
2201                + nla_total_size(16) /* RTA_SRC */
2202                + nla_total_size(16) /* RTA_DST */
2203                + nla_total_size(16) /* RTA_GATEWAY */
2204                + nla_total_size(16) /* RTA_PREFSRC */
2205                + nla_total_size(4) /* RTA_TABLE */
2206                + nla_total_size(4) /* RTA_IIF */
2207                + nla_total_size(4) /* RTA_OIF */
2208                + nla_total_size(4) /* RTA_PRIORITY */
2209                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2210                + nla_total_size(sizeof(struct rta_cacheinfo));
2211 }
2212
/*
 * Append one route message of type @type describing @rt to @skb.
 *
 * @dst / @src, when non-NULL, are reported as /128 RTA_DST / RTA_SRC in
 * place of the route's own keys (RTA_SRC only with CONFIG_IPV6_SUBTREES).
 * @iif, when non-zero, is reported as RTA_IIF (or, for multicast
 * destinations with CONFIG_IPV6_MROUTE, handed to ip6mr_get_route()
 * together with @nowait).  @prefix non-zero restricts output to routes
 * flagged RTF_PREFIX_RT.  @pid, @seq and @flags go into the netlink
 * header.
 *
 * Returns 1 when @prefix filtering skipped the route, -EMSGSIZE when the
 * message does not fit (the partial message is cancelled), 0 when
 * ip6mr_get_route() returned 0 with @nowait unset, otherwise the result
 * of nlmsg_end().
 *
 * Note: the NLA_PUT*() macros used below jump to nla_put_failure when the
 * attribute does not fit.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	/* Table id goes both into the header field and into RTA_TABLE. */
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	NLA_PUT_U32(skb, RTA_TABLE, table);
	/* Route type: reject > local flag > loopback device > unicast. */
	if (rt->rt6i_flags&RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_flags&RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	/* Start from the stored protocol, then let route flags override it. */
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags&RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags&RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags&RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* An explicit @dst is reported as a host (/128) destination. */
	if (dst) {
		NLA_PUT(skb, RTA_DST, 16, dst);
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		NLA_PUT(skb, RTA_SRC, 16, src);
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					/* 0 is passed straight back to the
					 * caller without nlmsg_end(); any
					 * negative value cancels the message.
					 */
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					/* With nowait only -EMSGSIZE aborts;
					 * other results fall through.
					 */
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, iif);
	} else if (dst) {
		/* No input interface: report the source address that
		 * ipv6_dev_get_saddr() selects for @dst, if it finds one.
		 */
		struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
		struct in6_addr saddr_buf;
		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
				       dst, 0, &saddr_buf) == 0)
			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	/* The gateway is taken from the attached neighbour entry's key. */
	if (rt->dst.neighbour)
		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);

	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);

	/* Remaining lifetime in jiffies: 0 for non-expiring routes,
	 * capped at INT_MAX otherwise.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES))
		expires = 0;
	else if (rt->rt6i_expires - jiffies < INT_MAX)
		expires = rt->rt6i_expires - jiffies;
	else
		expires = INT_MAX;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
			       expires, rt->dst.error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	/* Roll back everything added since nlmsg_put(). */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2332
2333 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2334 {
2335         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2336         int prefix;
2337
2338         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2339                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2340                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2341         } else
2342                 prefix = 0;
2343
2344         return rt6_fill_node(arg->net,
2345                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2346                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2347                      prefix, 0, NLM_F_MULTI);
2348 }
2349
2350 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2351 {
2352         struct net *net = sock_net(in_skb->sk);
2353         struct nlattr *tb[RTA_MAX+1];
2354         struct rt6_info *rt;
2355         struct sk_buff *skb;
2356         struct rtmsg *rtm;
2357         struct flowi fl;
2358         int err, iif = 0;
2359
2360         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2361         if (err < 0)
2362                 goto errout;
2363
2364         err = -EINVAL;
2365         memset(&fl, 0, sizeof(fl));
2366
2367         if (tb[RTA_SRC]) {
2368                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2369                         goto errout;
2370
2371                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2372         }
2373
2374         if (tb[RTA_DST]) {
2375                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2376                         goto errout;
2377
2378                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2379         }
2380
2381         if (tb[RTA_IIF])
2382                 iif = nla_get_u32(tb[RTA_IIF]);
2383
2384         if (tb[RTA_OIF])
2385                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2386
2387         if (iif) {
2388                 struct net_device *dev;
2389                 dev = __dev_get_by_index(net, iif);
2390                 if (!dev) {
2391                         err = -ENODEV;
2392                         goto errout;
2393                 }
2394         }
2395
2396         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2397         if (skb == NULL) {
2398                 err = -ENOBUFS;
2399                 goto errout;
2400         }
2401
2402         /* Reserve room for dummy headers, this skb can pass
2403            through good chunk of routing engine.
2404          */
2405         skb_reset_mac_header(skb);
2406         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2407
2408         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2409         skb_dst_set(skb, &rt->dst);
2410
2411         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2412                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2413                             nlh->nlmsg_seq, 0, 0, 0);
2414         if (err < 0) {
2415                 kfree_skb(skb);
2416                 goto errout;
2417         }
2418
2419         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2420 errout:
2421         return err;
2422 }
2423
2424 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2425 {
2426         struct sk_buff *skb;
2427         struct net *net = info->nl_net;
2428         u32 seq;
2429         int err;
2430
2431         err = -ENOBUFS;
2432         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2433
2434         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2435         if (skb == NULL)
2436                 goto errout;
2437
2438         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2439                                 event, info->pid, seq, 0, 0, 0);
2440         if (err < 0) {
2441                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2442                 WARN_ON(err == -EMSGSIZE);
2443                 kfree_skb(skb);
2444                 goto errout;
2445         }
2446         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2447                     info->nlh, gfp_any());
2448         return;
2449 errout:
2450         if (err < 0)
2451                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2452 }
2453
2454 static int ip6_route_dev_notify(struct notifier_block *this,
2455                                 unsigned long event, void *data)
2456 {
2457         struct net_device *dev = (struct net_device *)data;
2458         struct net *net = dev_net(dev);
2459
2460         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2461                 net->ipv6.ip6_null_entry->dst.dev = dev;
2462                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2463 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2464                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2465                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2466                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2467                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2468 #endif
2469         }
2470
2471         return NOTIFY_OK;
2472 }
2473
2474 /*
2475  *      /proc
2476  */
2477
2478 #ifdef CONFIG_PROC_FS
2479
/*
 * NOTE(review): nothing in the /proc code visible in this part of the
 * file references this structure (the handlers below work on a
 * struct seq_file); it looks like a leftover from an older buffer-based
 * implementation.  Confirm that there are no other users before removing.
 */
struct rt6_proc_arg
{
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2488
2489 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2490 {
2491         struct seq_file *m = p_arg;
2492
2493         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2494
2495 #ifdef CONFIG_IPV6_SUBTREES
2496         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2497 #else
2498         seq_puts(m, "00000000000000000000000000000000 00 ");
2499 #endif
2500
2501         if (rt->rt6i_nexthop) {
2502                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2503         } else {
2504                 seq_puts(m, "00000000000000000000000000000000");
2505         }
2506         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2507                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2508                    rt->dst.__use, rt->rt6i_flags,
2509                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2510         return 0;
2511 }
2512
2513 static int ipv6_route_show(struct seq_file *m, void *v)
2514 {
2515         struct net *net = (struct net *)m->private;
2516         fib6_clean_all(net, rt6_info_route, 0, m);
2517         return 0;
2518 }
2519
2520 static int ipv6_route_open(struct inode *inode, struct file *file)
2521 {
2522         return single_open_net(inode, file, ipv6_route_show);
2523 }
2524
/*
 * File operations for the route listing: opened through
 * ipv6_route_open(), read through the generic seq_file helpers.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner          = THIS_MODULE,
	.open           = ipv6_route_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release        = single_release_net,
};
2532
2533 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2534 {
2535         struct net *net = (struct net *)seq->private;
2536         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2537                    net->ipv6.rt6_stats->fib_nodes,
2538                    net->ipv6.rt6_stats->fib_route_nodes,
2539                    net->ipv6.rt6_stats->fib_rt_alloc,
2540                    net->ipv6.rt6_stats->fib_rt_entries,
2541                    net->ipv6.rt6_stats->fib_rt_cache,
2542                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2543                    net->ipv6.rt6_stats->fib_discarded_routes);
2544
2545         return 0;
2546 }
2547
2548 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2549 {
2550         return single_open_net(inode, file, rt6_stats_seq_show);
2551 }
2552
/*
 * File operations for the routing statistics file: opened through
 * rt6_stats_seq_open(), read through the generic seq_file helpers.
 */
static const struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
2560 #endif  /* CONFIG_PROC_FS */
2561
2562 #ifdef CONFIG_SYSCTL
2563
2564 static
2565 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2566                               void __user *buffer, size_t *lenp, loff_t *ppos)
2567 {
2568         struct net *net = current->nsproxy->net_ns;
2569         int delay = net->ipv6.sysctl.flush_delay;
2570         if (write) {
2571                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2572                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2573                 return 0;
2574         } else
2575                 return -EINVAL;
2576 }
2577
2578 ctl_table ipv6_route_table_template[] = {
2579         {
2580                 .procname       =       "flush",
2581                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2582                 .maxlen         =       sizeof(int),
2583                 .mode           =       0200,
2584                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2585         },
2586         {
2587                 .procname       =       "gc_thresh",
2588                 .data           =       &ip6_dst_ops_template.gc_thresh,
2589                 .maxlen         =       sizeof(int),
2590                 .mode           =       0644,
2591                 .proc_handler   =       proc_dointvec,
2592         },
2593         {
2594                 .procname       =       "max_size",
2595                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2596                 .maxlen         =       sizeof(int),
2597                 .mode           =       0644,
2598                 .proc_handler   =       proc_dointvec,
2599         },
2600         {
2601                 .procname       =       "gc_min_interval",
2602                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2603                 .maxlen         =       sizeof(int),
2604                 .mode           =       0644,
2605                 .proc_handler   =       proc_dointvec_jiffies,
2606         },
2607         {
2608                 .procname       =       "gc_timeout",
2609                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2610                 .maxlen         =       sizeof(int),
2611                 .mode           =       0644,
2612                 .proc_handler   =       proc_dointvec_jiffies,
2613         },
2614         {
2615                 .procname       =       "gc_interval",
2616                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2617                 .maxlen         =       sizeof(int),
2618                 .mode           =       0644,
2619                 .proc_handler   =       proc_dointvec_jiffies,
2620         },
2621         {
2622                 .procname       =       "gc_elasticity",
2623                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2624                 .maxlen         =       sizeof(int),
2625                 .mode           =       0644,
2626                 .proc_handler   =       proc_dointvec,
2627         },
2628         {
2629                 .procname       =       "mtu_expires",
2630                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2631                 .maxlen         =       sizeof(int),
2632                 .mode           =       0644,
2633                 .proc_handler   =       proc_dointvec_jiffies,
2634         },
2635         {
2636                 .procname       =       "min_adv_mss",
2637                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2638                 .maxlen         =       sizeof(int),
2639                 .mode           =       0644,
2640                 .proc_handler   =       proc_dointvec,
2641         },
2642         {
2643                 .procname       =       "gc_min_interval_ms",
2644                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2645                 .maxlen         =       sizeof(int),
2646                 .mode           =       0644,
2647                 .proc_handler   =       proc_dointvec_ms_jiffies,
2648         },
2649         { }
2650 };
2651
2652 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2653 {
2654         struct ctl_table *table;
2655
2656         table = kmemdup(ipv6_route_table_template,
2657                         sizeof(ipv6_route_table_template),
2658                         GFP_KERNEL);
2659
2660         if (table) {
2661                 table[0].data = &net->ipv6.sysctl.flush_delay;
2662                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2663                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2664                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2665                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2666                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2667                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2668                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2669                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2670                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2671         }
2672
2673         return table;
2674 }
2675 #endif
2676
2677 static int __net_init ip6_route_net_init(struct net *net)
2678 {
2679         int ret = -ENOMEM;
2680
2681         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2682                sizeof(net->ipv6.ip6_dst_ops));
2683
2684         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2685                 goto out_ip6_dst_ops;
2686
2687         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2688                                            sizeof(*net->ipv6.ip6_null_entry),
2689                                            GFP_KERNEL);
2690         if (!net->ipv6.ip6_null_entry)
2691                 goto out_ip6_dst_entries;
2692         net->ipv6.ip6_null_entry->dst.path =
2693                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2694         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2695         dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2696
2697 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2698         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2699                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2700                                                GFP_KERNEL);
2701         if (!net->ipv6.ip6_prohibit_entry)
2702                 goto out_ip6_null_entry;
2703         net->ipv6.ip6_prohibit_entry->dst.path =
2704                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2705         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2706         dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2707
2708         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2709                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2710                                                GFP_KERNEL);
2711         if (!net->ipv6.ip6_blk_hole_entry)
2712                 goto out_ip6_prohibit_entry;
2713         net->ipv6.ip6_blk_hole_entry->dst.path =
2714                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2715         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2716         dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2717 #endif
2718
2719         net->ipv6.sysctl.flush_delay = 0;
2720         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2721         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2722         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2723         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2724         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2725         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2726         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2727
2728 #ifdef CONFIG_PROC_FS
2729         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2730         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2731 #endif
2732         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2733
2734         ret = 0;
2735 out:
2736         return ret;
2737
2738 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2739 out_ip6_prohibit_entry:
2740         kfree(net->ipv6.ip6_prohibit_entry);
2741 out_ip6_null_entry:
2742         kfree(net->ipv6.ip6_null_entry);
2743 #endif
2744 out_ip6_dst_entries:
2745         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2746 out_ip6_dst_ops:
2747         goto out;
2748 }
2749
2750 static void __net_exit ip6_route_net_exit(struct net *net)
2751 {
2752 #ifdef CONFIG_PROC_FS
2753         proc_net_remove(net, "ipv6_route");
2754         proc_net_remove(net, "rt6_stats");
2755 #endif
2756         kfree(net->ipv6.ip6_null_entry);
2757 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2758         kfree(net->ipv6.ip6_prohibit_entry);
2759         kfree(net->ipv6.ip6_blk_hole_entry);
2760 #endif
2761         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2762 }
2763
2764 static struct pernet_operations ip6_route_net_ops = {
2765         .init = ip6_route_net_init,
2766         .exit = ip6_route_net_exit,
2767 };
2768
2769 static struct notifier_block ip6_route_dev_notifier = {
2770         .notifier_call = ip6_route_dev_notify,
2771         .priority = 0,
2772 };
2773
2774 int __init ip6_route_init(void)
2775 {
2776         int ret;
2777
2778         ret = -ENOMEM;
2779         ip6_dst_ops_template.kmem_cachep =
2780                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2781                                   SLAB_HWCACHE_ALIGN, NULL);
2782         if (!ip6_dst_ops_template.kmem_cachep)
2783                 goto out;
2784
2785         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2786         if (ret)
2787                 goto out_kmem_cache;
2788
2789         ret = register_pernet_subsys(&ip6_route_net_ops);
2790         if (ret)
2791                 goto out_dst_entries;
2792
2793         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2794
2795         /* Registering of the loopback is done before this portion of code,
2796          * the loopback reference in rt6_info will not be taken, do it
2797          * manually for init_net */
2798         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2799         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2800   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2801         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2802         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2803         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2804         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2805   #endif
2806         ret = fib6_init();
2807         if (ret)
2808                 goto out_register_subsys;
2809
2810         ret = xfrm6_init();
2811         if (ret)
2812                 goto out_fib6_init;
2813
2814         ret = fib6_rules_init();
2815         if (ret)
2816                 goto xfrm6_init;
2817
2818         ret = -ENOBUFS;
2819         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2820             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2821             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2822                 goto fib6_rules_init;
2823
2824         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2825         if (ret)
2826                 goto fib6_rules_init;
2827
2828 out:
2829         return ret;
2830
2831 fib6_rules_init:
2832         fib6_rules_cleanup();
2833 xfrm6_init:
2834         xfrm6_fini();
2835 out_fib6_init:
2836         fib6_gc_cleanup();
2837 out_register_subsys:
2838         unregister_pernet_subsys(&ip6_route_net_ops);
2839 out_dst_entries:
2840         dst_entries_destroy(&ip6_dst_blackhole_ops);
2841 out_kmem_cache:
2842         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2843         goto out;
2844 }
2845
2846 void ip6_route_cleanup(void)
2847 {
2848         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2849         fib6_rules_cleanup();
2850         xfrm6_fini();
2851         fib6_gc_cleanup();
2852         unregister_pernet_subsys(&ip6_route_net_ops);
2853         dst_entries_destroy(&ip6_dst_blackhole_ops);
2854         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2855 }