]> git.karo-electronics.de Git - mv-sheeva.git/blob - net/ipv6/route.c
net: Abstract away all dst_entry metrics accesses.
[mv-sheeva.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  Build with RT6_DEBUG set to 3 (or more)
 * to make RDBG() and RT6_TRACE() emit printk output; below that level both
 * compile to no-ops.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, ip6_pol_route() clones routes that already have a nexthop
 * (or are RTF_NONEXTHOP) through rt6_alloc_clone() instead of returning
 * them directly.
 */
#define CLONE_OFFLINK_ROUTE 0
76
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops_template = {
101         .family                 =       AF_INET6,
102         .protocol               =       cpu_to_be16(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .destroy                =       ip6_dst_destroy,
107         .ifdown                 =       ip6_dst_ifdown,
108         .negative_advice        =       ip6_negative_advice,
109         .link_failure           =       ip6_link_failure,
110         .update_pmtu            =       ip6_rt_update_pmtu,
111         .local_out              =       __ip6_local_out,
112 };
113
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117
118 static struct dst_ops ip6_dst_blackhole_ops = {
119         .family                 =       AF_INET6,
120         .protocol               =       cpu_to_be16(ETH_P_IPV6),
121         .destroy                =       ip6_dst_destroy,
122         .check                  =       ip6_dst_check,
123         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
124 };
125
126 static struct rt6_info ip6_null_entry_template = {
127         .dst = {
128                 .__refcnt       = ATOMIC_INIT(1),
129                 .__use          = 1,
130                 .obsolete       = -1,
131                 .error          = -ENETUNREACH,
132                 .input          = ip6_pkt_discard,
133                 .output         = ip6_pkt_discard_out,
134         },
135         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
136         .rt6i_protocol  = RTPROT_KERNEL,
137         .rt6i_metric    = ~(u32) 0,
138         .rt6i_ref       = ATOMIC_INIT(1),
139 };
140
141 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
142
143 static int ip6_pkt_prohibit(struct sk_buff *skb);
144 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
145
146 static struct rt6_info ip6_prohibit_entry_template = {
147         .dst = {
148                 .__refcnt       = ATOMIC_INIT(1),
149                 .__use          = 1,
150                 .obsolete       = -1,
151                 .error          = -EACCES,
152                 .input          = ip6_pkt_prohibit,
153                 .output         = ip6_pkt_prohibit_out,
154         },
155         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
156         .rt6i_protocol  = RTPROT_KERNEL,
157         .rt6i_metric    = ~(u32) 0,
158         .rt6i_ref       = ATOMIC_INIT(1),
159 };
160
161 static struct rt6_info ip6_blk_hole_entry_template = {
162         .dst = {
163                 .__refcnt       = ATOMIC_INIT(1),
164                 .__use          = 1,
165                 .obsolete       = -1,
166                 .error          = -EINVAL,
167                 .input          = dst_discard,
168                 .output         = dst_discard,
169         },
170         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
171         .rt6i_protocol  = RTPROT_KERNEL,
172         .rt6i_metric    = ~(u32) 0,
173         .rt6i_ref       = ATOMIC_INIT(1),
174 };
175
176 #endif
177
/*
 * Allocate a dst entry from @ops and return it typed as an rt6_info.
 * The result is whatever dst_alloc() returned, so it may be NULL.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct dst_entry *entry = dst_alloc(ops);

	return (struct rt6_info *)entry;
}
183
184 static void ip6_dst_destroy(struct dst_entry *dst)
185 {
186         struct rt6_info *rt = (struct rt6_info *)dst;
187         struct inet6_dev *idev = rt->rt6i_idev;
188         struct inet_peer *peer = rt->rt6i_peer;
189
190         if (idev != NULL) {
191                 rt->rt6i_idev = NULL;
192                 in6_dev_put(idev);
193         }
194         if (peer) {
195                 BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
196                 rt->rt6i_peer = NULL;
197                 inet_putpeer(peer);
198         }
199 }
200
201 void rt6_bind_peer(struct rt6_info *rt, int create)
202 {
203         struct inet_peer *peer;
204
205         if (WARN_ON(!(rt->rt6i_flags & RTF_CACHE)))
206                 return;
207
208         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
209         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
210                 inet_putpeer(peer);
211 }
212
213 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
214                            int how)
215 {
216         struct rt6_info *rt = (struct rt6_info *)dst;
217         struct inet6_dev *idev = rt->rt6i_idev;
218         struct net_device *loopback_dev =
219                 dev_net(dev)->loopback_dev;
220
221         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
222                 struct inet6_dev *loopback_idev =
223                         in6_dev_get(loopback_dev);
224                 if (loopback_idev != NULL) {
225                         rt->rt6i_idev = loopback_idev;
226                         in6_dev_put(idev);
227                 }
228         }
229 }
230
231 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
232 {
233         return (rt->rt6i_flags & RTF_EXPIRES) &&
234                 time_after(jiffies, rt->rt6i_expires);
235 }
236
237 static inline int rt6_need_strict(struct in6_addr *daddr)
238 {
239         return ipv6_addr_type(daddr) &
240                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
241 }
242
243 /*
244  *      Route lookup. Any table->tb6_lock is implied.
245  */
246
247 static inline struct rt6_info *rt6_device_match(struct net *net,
248                                                     struct rt6_info *rt,
249                                                     struct in6_addr *saddr,
250                                                     int oif,
251                                                     int flags)
252 {
253         struct rt6_info *local = NULL;
254         struct rt6_info *sprt;
255
256         if (!oif && ipv6_addr_any(saddr))
257                 goto out;
258
259         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
260                 struct net_device *dev = sprt->rt6i_dev;
261
262                 if (oif) {
263                         if (dev->ifindex == oif)
264                                 return sprt;
265                         if (dev->flags & IFF_LOOPBACK) {
266                                 if (sprt->rt6i_idev == NULL ||
267                                     sprt->rt6i_idev->dev->ifindex != oif) {
268                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
269                                                 continue;
270                                         if (local && (!oif ||
271                                                       local->rt6i_idev->dev->ifindex == oif))
272                                                 continue;
273                                 }
274                                 local = sprt;
275                         }
276                 } else {
277                         if (ipv6_chk_addr(net, saddr, dev,
278                                           flags & RT6_LOOKUP_F_IFACE))
279                                 return sprt;
280                 }
281         }
282
283         if (oif) {
284                 if (local)
285                         return local;
286
287                 if (flags & RT6_LOOKUP_F_IFACE)
288                         return net->ipv6.ip6_null_entry;
289         }
290 out:
291         return rt;
292 }
293
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the nexthop neighbour of @rt is not in a
 * NUD_VALID state, send it a neighbour solicitation (to the solicited-node
 * multicast address of the neighbour's key).
 *
 * Router Reachability Probes MUST be rate-limited to no more than one per
 * minute; here the interval comes from the route's inet6_dev
 * (cnf.rtr_probe_interval) and is measured from neigh->updated.
 *
 * @rt may be NULL, in which case nothing happens.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/* Cheap unlocked pre-check; repeated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/*
	 * neigh->updated is modified below, so the write side of the lock is
	 * required.  Holding only the read side would let concurrent lookups
	 * all pass the time check and each send a probe, defeating the
	 * rate limit.
	 */
	write_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies,
			neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		write_unlock_bh(&neigh->lock);
		return;
	}
	neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	/* Send the solicitation outside the lock. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no reachability probes are sent. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
328
329 /*
330  * Default Router Selection (RFC 2461 6.3.6)
331  */
332 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
333 {
334         struct net_device *dev = rt->rt6i_dev;
335         if (!oif || dev->ifindex == oif)
336                 return 2;
337         if ((dev->flags & IFF_LOOPBACK) &&
338             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
339                 return 1;
340         return 0;
341 }
342
343 static inline int rt6_check_neigh(struct rt6_info *rt)
344 {
345         struct neighbour *neigh = rt->rt6i_nexthop;
346         int m;
347         if (rt->rt6i_flags & RTF_NONEXTHOP ||
348             !(rt->rt6i_flags & RTF_GATEWAY))
349                 m = 1;
350         else if (neigh) {
351                 read_lock_bh(&neigh->lock);
352                 if (neigh->nud_state & NUD_VALID)
353                         m = 2;
354 #ifdef CONFIG_IPV6_ROUTER_PREF
355                 else if (neigh->nud_state & NUD_FAILED)
356                         m = 0;
357 #endif
358                 else
359                         m = 1;
360                 read_unlock_bh(&neigh->lock);
361         } else
362                 m = 0;
363         return m;
364 }
365
366 static int rt6_score_route(struct rt6_info *rt, int oif,
367                            int strict)
368 {
369         int m, n;
370
371         m = rt6_check_dev(rt, oif);
372         if (!m && (strict & RT6_LOOKUP_F_IFACE))
373                 return -1;
374 #ifdef CONFIG_IPV6_ROUTER_PREF
375         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
376 #endif
377         n = rt6_check_neigh(rt);
378         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
379                 return -1;
380         return m;
381 }
382
383 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
384                                    int *mpri, struct rt6_info *match)
385 {
386         int m;
387
388         if (rt6_check_expired(rt))
389                 goto out;
390
391         m = rt6_score_route(rt, oif, strict);
392         if (m < 0)
393                 goto out;
394
395         if (m > *mpri) {
396                 if (strict & RT6_LOOKUP_F_REACHABLE)
397                         rt6_probe(match);
398                 *mpri = m;
399                 match = rt;
400         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
401                 rt6_probe(rt);
402         }
403
404 out:
405         return match;
406 }
407
408 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
409                                      struct rt6_info *rr_head,
410                                      u32 metric, int oif, int strict)
411 {
412         struct rt6_info *rt, *match;
413         int mpri = -1;
414
415         match = NULL;
416         for (rt = rr_head; rt && rt->rt6i_metric == metric;
417              rt = rt->dst.rt6_next)
418                 match = find_match(rt, oif, strict, &mpri, match);
419         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
420              rt = rt->dst.rt6_next)
421                 match = find_match(rt, oif, strict, &mpri, match);
422
423         return match;
424 }
425
426 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
427 {
428         struct rt6_info *match, *rt0;
429         struct net *net;
430
431         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
432                   __func__, fn->leaf, oif);
433
434         rt0 = fn->rr_ptr;
435         if (!rt0)
436                 fn->rr_ptr = rt0 = fn->leaf;
437
438         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
439
440         if (!match &&
441             (strict & RT6_LOOKUP_F_REACHABLE)) {
442                 struct rt6_info *next = rt0->dst.rt6_next;
443
444                 /* no entries matched; do round-robin */
445                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
446                         next = fn->leaf;
447
448                 if (next != rt0)
449                         fn->rr_ptr = next;
450         }
451
452         RT6_TRACE("%s() => %p\n",
453                   __func__, match);
454
455         net = dev_net(rt0->rt6i_dev);
456         return match ? match : net->ipv6.ip6_null_entry;
457 }
458
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option (@opt, @len bytes) that names
 * @gwaddr as gateway on @dev.
 *
 * The option is validated, then the matching route-info route is looked
 * up.  A zero lifetime deletes an existing route; a non-zero lifetime
 * creates the route if missing, or refreshes the preference bits of an
 * existing one.  The expiry is then set from the lifetime (or cleared for
 * an infinite lifetime).
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length: length is at most 3, and
	 * must be large enough to hold prefix_len bits of prefix.
	 */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* Full 128-bit prefix present: use it in place. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}

	/* presumably drops the reference taken by the get/add helper */
	dst_release(&rt->dst);
	return 0;
}
#endif
532
/*
 * BACKTRACK(__net, saddr) - climb the fib tree after a failed lookup.
 *
 * Relies on the invoking function providing local variables `rt` and `fn`
 * and the labels `out` and `restart`.  If `rt` is the namespace's null
 * entry, walk from `fn` towards the root (descending into a parent's
 * source subtree via fib6_lookup() when there is one that we did not come
 * from) until a node carrying RTN_RTINFO is found, then jump to `restart`.
 * Reaching a node flagged RTN_TL_ROOT jumps to `out` instead.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn;				\
			else						\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
550
551 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
552                                              struct fib6_table *table,
553                                              struct flowi *fl, int flags)
554 {
555         struct fib6_node *fn;
556         struct rt6_info *rt;
557
558         read_lock_bh(&table->tb6_lock);
559         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
560 restart:
561         rt = fn->leaf;
562         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
563         BACKTRACK(net, &fl->fl6_src);
564 out:
565         dst_use(&rt->dst, jiffies);
566         read_unlock_bh(&table->tb6_lock);
567         return rt;
568
569 }
570
571 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
572                             const struct in6_addr *saddr, int oif, int strict)
573 {
574         struct flowi fl = {
575                 .oif = oif,
576                 .fl6_dst = *daddr,
577         };
578         struct dst_entry *dst;
579         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
580
581         if (saddr) {
582                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
583                 flags |= RT6_LOOKUP_F_HAS_SADDR;
584         }
585
586         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
587         if (dst->error == 0)
588                 return (struct rt6_info *) dst;
589
590         dst_release(dst);
591
592         return NULL;
593 }
594
595 EXPORT_SYMBOL(rt6_lookup);
596
597 /* ip6_ins_rt is called with FREE table->tb6_lock.
598    It takes new route entry, the addition fails by any reason the
599    route is freed. In any case, if caller does not hold it, it may
600    be destroyed.
601  */
602
603 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
604 {
605         int err;
606         struct fib6_table *table;
607
608         table = rt->rt6i_table;
609         write_lock_bh(&table->tb6_lock);
610         err = fib6_add(&table->tb6_root, rt, info);
611         write_unlock_bh(&table->tb6_lock);
612
613         return err;
614 }
615
616 int ip6_ins_rt(struct rt6_info *rt)
617 {
618         struct nl_info info = {
619                 .nl_net = dev_net(rt->rt6i_dev),
620         };
621         return __ip6_ins_rt(rt, &info);
622 }
623
624 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
625                                       struct in6_addr *saddr)
626 {
627         struct rt6_info *rt;
628
629         /*
630          *      Clone the route.
631          */
632
633         rt = ip6_rt_copy(ort);
634
635         if (rt) {
636                 struct neighbour *neigh;
637                 int attempts = !in_softirq();
638
639                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
640                         if (rt->rt6i_dst.plen != 128 &&
641                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
642                                 rt->rt6i_flags |= RTF_ANYCAST;
643                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
644                 }
645
646                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
647                 rt->rt6i_dst.plen = 128;
648                 rt->rt6i_flags |= RTF_CACHE;
649                 rt->dst.flags |= DST_HOST;
650
651 #ifdef CONFIG_IPV6_SUBTREES
652                 if (rt->rt6i_src.plen && saddr) {
653                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
654                         rt->rt6i_src.plen = 128;
655                 }
656 #endif
657
658         retry:
659                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
660                 if (IS_ERR(neigh)) {
661                         struct net *net = dev_net(rt->rt6i_dev);
662                         int saved_rt_min_interval =
663                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
664                         int saved_rt_elasticity =
665                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
666
667                         if (attempts-- > 0) {
668                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
669                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
670
671                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
672
673                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
674                                         saved_rt_elasticity;
675                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
676                                         saved_rt_min_interval;
677                                 goto retry;
678                         }
679
680                         if (net_ratelimit())
681                                 printk(KERN_WARNING
682                                        "ipv6: Neighbour table overflow.\n");
683                         dst_free(&rt->dst);
684                         return NULL;
685                 }
686                 rt->rt6i_nexthop = neigh;
687
688         }
689
690         return rt;
691 }
692
693 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
694 {
695         struct rt6_info *rt = ip6_rt_copy(ort);
696         if (rt) {
697                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
698                 rt->rt6i_dst.plen = 128;
699                 rt->rt6i_flags |= RTF_CACHE;
700                 rt->dst.flags |= DST_HOST;
701                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
702         }
703         return rt;
704 }
705
706 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
707                                       struct flowi *fl, int flags)
708 {
709         struct fib6_node *fn;
710         struct rt6_info *rt, *nrt;
711         int strict = 0;
712         int attempts = 3;
713         int err;
714         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
715
716         strict |= flags & RT6_LOOKUP_F_IFACE;
717
718 relookup:
719         read_lock_bh(&table->tb6_lock);
720
721 restart_2:
722         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
723
724 restart:
725         rt = rt6_select(fn, oif, strict | reachable);
726
727         BACKTRACK(net, &fl->fl6_src);
728         if (rt == net->ipv6.ip6_null_entry ||
729             rt->rt6i_flags & RTF_CACHE)
730                 goto out;
731
732         dst_hold(&rt->dst);
733         read_unlock_bh(&table->tb6_lock);
734
735         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
736                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
737         else {
738 #if CLONE_OFFLINK_ROUTE
739                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
740 #else
741                 goto out2;
742 #endif
743         }
744
745         dst_release(&rt->dst);
746         rt = nrt ? : net->ipv6.ip6_null_entry;
747
748         dst_hold(&rt->dst);
749         if (nrt) {
750                 err = ip6_ins_rt(nrt);
751                 if (!err)
752                         goto out2;
753         }
754
755         if (--attempts <= 0)
756                 goto out2;
757
758         /*
759          * Race condition! In the gap, when table->tb6_lock was
760          * released someone could insert this route.  Relookup.
761          */
762         dst_release(&rt->dst);
763         goto relookup;
764
765 out:
766         if (reachable) {
767                 reachable = 0;
768                 goto restart_2;
769         }
770         dst_hold(&rt->dst);
771         read_unlock_bh(&table->tb6_lock);
772 out2:
773         rt->dst.lastuse = jiffies;
774         rt->dst.__use++;
775
776         return rt;
777 }
778
779 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
780                                             struct flowi *fl, int flags)
781 {
782         return ip6_pol_route(net, table, fl->iif, fl, flags);
783 }
784
785 void ip6_route_input(struct sk_buff *skb)
786 {
787         struct ipv6hdr *iph = ipv6_hdr(skb);
788         struct net *net = dev_net(skb->dev);
789         int flags = RT6_LOOKUP_F_HAS_SADDR;
790         struct flowi fl = {
791                 .iif = skb->dev->ifindex,
792                 .fl6_dst = iph->daddr,
793                 .fl6_src = iph->saddr,
794                 .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
795                 .mark = skb->mark,
796                 .proto = iph->nexthdr,
797         };
798
799         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
800                 flags |= RT6_LOOKUP_F_IFACE;
801
802         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
803 }
804
805 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
806                                              struct flowi *fl, int flags)
807 {
808         return ip6_pol_route(net, table, fl->oif, fl, flags);
809 }
810
811 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
812                                     struct flowi *fl)
813 {
814         int flags = 0;
815
816         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
817                 flags |= RT6_LOOKUP_F_IFACE;
818
819         if (!ipv6_addr_any(&fl->fl6_src))
820                 flags |= RT6_LOOKUP_F_HAS_SADDR;
821         else if (sk)
822                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
823
824         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
825 }
826
827 EXPORT_SYMBOL(ip6_route_output);
828
829 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
830 {
831         struct rt6_info *ort = (struct rt6_info *) *dstp;
832         struct rt6_info *rt = (struct rt6_info *)
833                 dst_alloc(&ip6_dst_blackhole_ops);
834         struct dst_entry *new = NULL;
835
836         if (rt) {
837                 new = &rt->dst;
838
839                 atomic_set(&new->__refcnt, 1);
840                 new->__use = 1;
841                 new->input = dst_discard;
842                 new->output = dst_discard;
843
844                 dst_copy_metrics(new, &ort->dst);
845                 new->dev = ort->dst.dev;
846                 if (new->dev)
847                         dev_hold(new->dev);
848                 rt->rt6i_idev = ort->rt6i_idev;
849                 if (rt->rt6i_idev)
850                         in6_dev_hold(rt->rt6i_idev);
851                 rt->rt6i_expires = 0;
852
853                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
854                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
855                 rt->rt6i_metric = 0;
856
857                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
858 #ifdef CONFIG_IPV6_SUBTREES
859                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
860 #endif
861
862                 dst_free(new);
863         }
864
865         dst_release(*dstp);
866         *dstp = new;
867         return new ? 0 : -ENOMEM;
868 }
869 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
870
871 /*
872  *      Destination cache support functions
873  */
874
875 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
876 {
877         struct rt6_info *rt;
878
879         rt = (struct rt6_info *) dst;
880
881         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
882                 return dst;
883
884         return NULL;
885 }
886
887 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
888 {
889         struct rt6_info *rt = (struct rt6_info *) dst;
890
891         if (rt) {
892                 if (rt->rt6i_flags & RTF_CACHE) {
893                         if (rt6_check_expired(rt)) {
894                                 ip6_del_rt(rt);
895                                 dst = NULL;
896                         }
897                 } else {
898                         dst_release(dst);
899                         dst = NULL;
900                 }
901         }
902         return dst;
903 }
904
905 static void ip6_link_failure(struct sk_buff *skb)
906 {
907         struct rt6_info *rt;
908
909         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
910
911         rt = (struct rt6_info *) skb_dst(skb);
912         if (rt) {
913                 if (rt->rt6i_flags&RTF_CACHE) {
914                         dst_set_expires(&rt->dst, 0);
915                         rt->rt6i_flags |= RTF_EXPIRES;
916                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
917                         rt->rt6i_node->fn_sernum = -1;
918         }
919 }
920
921 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
922 {
923         struct rt6_info *rt6 = (struct rt6_info*)dst;
924
925         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
926                 rt6->rt6i_flags |= RTF_MODIFIED;
927                 if (mtu < IPV6_MIN_MTU) {
928                         u32 features = dst_metric(dst, RTAX_FEATURES);
929                         mtu = IPV6_MIN_MTU;
930                         features |= RTAX_FEATURE_ALLFRAG;
931                         dst_metric_set(dst, RTAX_FEATURES, features);
932                 }
933                 dst_metric_set(dst, RTAX_MTU, mtu);
934                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
935         }
936 }
937
938 static int ipv6_get_mtu(struct net_device *dev);
939
940 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
941 {
942         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
943
944         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
945                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
946
947         /*
948          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
949          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
950          * IPV6_MAXPLEN is also valid and means: "any MSS,
951          * rely only on pmtu discovery"
952          */
953         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
954                 mtu = IPV6_MAXPLEN;
955         return mtu;
956 }
957
958 static struct dst_entry *icmp6_dst_gc_list;
959 static DEFINE_SPINLOCK(icmp6_dst_lock);
960
961 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
962                                   struct neighbour *neigh,
963                                   const struct in6_addr *addr)
964 {
965         struct rt6_info *rt;
966         struct inet6_dev *idev = in6_dev_get(dev);
967         struct net *net = dev_net(dev);
968
969         if (unlikely(idev == NULL))
970                 return NULL;
971
972         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
973         if (unlikely(rt == NULL)) {
974                 in6_dev_put(idev);
975                 goto out;
976         }
977
978         dev_hold(dev);
979         if (neigh)
980                 neigh_hold(neigh);
981         else {
982                 neigh = ndisc_get_neigh(dev, addr);
983                 if (IS_ERR(neigh))
984                         neigh = NULL;
985         }
986
987         rt->rt6i_dev      = dev;
988         rt->rt6i_idev     = idev;
989         rt->rt6i_nexthop  = neigh;
990         atomic_set(&rt->dst.__refcnt, 1);
991         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
992         dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(rt->rt6i_dev));
993         dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, dst_mtu(&rt->dst)));
994         rt->dst.output  = ip6_output;
995
996 #if 0   /* there's no chance to use these for ndisc */
997         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
998                                 ? DST_HOST
999                                 : 0;
1000         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1001         rt->rt6i_dst.plen = 128;
1002 #endif
1003
1004         spin_lock_bh(&icmp6_dst_lock);
1005         rt->dst.next = icmp6_dst_gc_list;
1006         icmp6_dst_gc_list = &rt->dst;
1007         spin_unlock_bh(&icmp6_dst_lock);
1008
1009         fib6_force_start_gc(net);
1010
1011 out:
1012         return &rt->dst;
1013 }
1014
1015 int icmp6_dst_gc(void)
1016 {
1017         struct dst_entry *dst, *next, **pprev;
1018         int more = 0;
1019
1020         next = NULL;
1021
1022         spin_lock_bh(&icmp6_dst_lock);
1023         pprev = &icmp6_dst_gc_list;
1024
1025         while ((dst = *pprev) != NULL) {
1026                 if (!atomic_read(&dst->__refcnt)) {
1027                         *pprev = dst->next;
1028                         dst_free(dst);
1029                 } else {
1030                         pprev = &dst->next;
1031                         ++more;
1032                 }
1033         }
1034
1035         spin_unlock_bh(&icmp6_dst_lock);
1036
1037         return more;
1038 }
1039
1040 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1041                             void *arg)
1042 {
1043         struct dst_entry *dst, **pprev;
1044
1045         spin_lock_bh(&icmp6_dst_lock);
1046         pprev = &icmp6_dst_gc_list;
1047         while ((dst = *pprev) != NULL) {
1048                 struct rt6_info *rt = (struct rt6_info *) dst;
1049                 if (func(rt, arg)) {
1050                         *pprev = dst->next;
1051                         dst_free(dst);
1052                 } else {
1053                         pprev = &dst->next;
1054                 }
1055         }
1056         spin_unlock_bh(&icmp6_dst_lock);
1057 }
1058
1059 static int ip6_dst_gc(struct dst_ops *ops)
1060 {
1061         unsigned long now = jiffies;
1062         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1063         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1064         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1065         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1066         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1067         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1068         int entries;
1069
1070         entries = dst_entries_get_fast(ops);
1071         if (time_after(rt_last_gc + rt_min_interval, now) &&
1072             entries <= rt_max_size)
1073                 goto out;
1074
1075         net->ipv6.ip6_rt_gc_expire++;
1076         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1077         net->ipv6.ip6_rt_last_gc = now;
1078         entries = dst_entries_get_slow(ops);
1079         if (entries < ops->gc_thresh)
1080                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1081 out:
1082         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1083         return entries > rt_max_size;
1084 }
1085
1086 /* Clean host part of a prefix. Not necessary in radix tree,
1087    but results in cleaner routing tables.
1088
1089    Remove it only when all the things will work!
1090  */
1091
1092 static int ipv6_get_mtu(struct net_device *dev)
1093 {
1094         int mtu = IPV6_MIN_MTU;
1095         struct inet6_dev *idev;
1096
1097         rcu_read_lock();
1098         idev = __in6_dev_get(dev);
1099         if (idev)
1100                 mtu = idev->cnf.mtu6;
1101         rcu_read_unlock();
1102         return mtu;
1103 }
1104
1105 int ip6_dst_hoplimit(struct dst_entry *dst)
1106 {
1107         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1108         if (hoplimit < 0) {
1109                 struct net_device *dev = dst->dev;
1110                 struct inet6_dev *idev;
1111
1112                 rcu_read_lock();
1113                 idev = __in6_dev_get(dev);
1114                 if (idev)
1115                         hoplimit = idev->cnf.hop_limit;
1116                 else
1117                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1118                 rcu_read_unlock();
1119         }
1120         return hoplimit;
1121 }
1122
1123 /*
1124  *
1125  */
1126
1127 int ip6_route_add(struct fib6_config *cfg)
1128 {
1129         int err;
1130         struct net *net = cfg->fc_nlinfo.nl_net;
1131         struct rt6_info *rt = NULL;
1132         struct net_device *dev = NULL;
1133         struct inet6_dev *idev = NULL;
1134         struct fib6_table *table;
1135         int addr_type;
1136
1137         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1138                 return -EINVAL;
1139 #ifndef CONFIG_IPV6_SUBTREES
1140         if (cfg->fc_src_len)
1141                 return -EINVAL;
1142 #endif
1143         if (cfg->fc_ifindex) {
1144                 err = -ENODEV;
1145                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1146                 if (!dev)
1147                         goto out;
1148                 idev = in6_dev_get(dev);
1149                 if (!idev)
1150                         goto out;
1151         }
1152
1153         if (cfg->fc_metric == 0)
1154                 cfg->fc_metric = IP6_RT_PRIO_USER;
1155
1156         table = fib6_new_table(net, cfg->fc_table);
1157         if (table == NULL) {
1158                 err = -ENOBUFS;
1159                 goto out;
1160         }
1161
1162         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1163
1164         if (rt == NULL) {
1165                 err = -ENOMEM;
1166                 goto out;
1167         }
1168
1169         rt->dst.obsolete = -1;
1170         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1171                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1172                                 0;
1173
1174         if (cfg->fc_protocol == RTPROT_UNSPEC)
1175                 cfg->fc_protocol = RTPROT_BOOT;
1176         rt->rt6i_protocol = cfg->fc_protocol;
1177
1178         addr_type = ipv6_addr_type(&cfg->fc_dst);
1179
1180         if (addr_type & IPV6_ADDR_MULTICAST)
1181                 rt->dst.input = ip6_mc_input;
1182         else if (cfg->fc_flags & RTF_LOCAL)
1183                 rt->dst.input = ip6_input;
1184         else
1185                 rt->dst.input = ip6_forward;
1186
1187         rt->dst.output = ip6_output;
1188
1189         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1190         rt->rt6i_dst.plen = cfg->fc_dst_len;
1191         if (rt->rt6i_dst.plen == 128)
1192                rt->dst.flags = DST_HOST;
1193
1194 #ifdef CONFIG_IPV6_SUBTREES
1195         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1196         rt->rt6i_src.plen = cfg->fc_src_len;
1197 #endif
1198
1199         rt->rt6i_metric = cfg->fc_metric;
1200
1201         /* We cannot add true routes via loopback here,
1202            they would result in kernel looping; promote them to reject routes
1203          */
1204         if ((cfg->fc_flags & RTF_REJECT) ||
1205             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1206                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1207                 /* hold loopback dev/idev if we haven't done so. */
1208                 if (dev != net->loopback_dev) {
1209                         if (dev) {
1210                                 dev_put(dev);
1211                                 in6_dev_put(idev);
1212                         }
1213                         dev = net->loopback_dev;
1214                         dev_hold(dev);
1215                         idev = in6_dev_get(dev);
1216                         if (!idev) {
1217                                 err = -ENODEV;
1218                                 goto out;
1219                         }
1220                 }
1221                 rt->dst.output = ip6_pkt_discard_out;
1222                 rt->dst.input = ip6_pkt_discard;
1223                 rt->dst.error = -ENETUNREACH;
1224                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1225                 goto install_route;
1226         }
1227
1228         if (cfg->fc_flags & RTF_GATEWAY) {
1229                 struct in6_addr *gw_addr;
1230                 int gwa_type;
1231
1232                 gw_addr = &cfg->fc_gateway;
1233                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1234                 gwa_type = ipv6_addr_type(gw_addr);
1235
1236                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1237                         struct rt6_info *grt;
1238
1239                         /* IPv6 strictly inhibits using not link-local
1240                            addresses as nexthop address.
1241                            Otherwise, router will not able to send redirects.
1242                            It is very good, but in some (rare!) circumstances
1243                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1244                            some exceptions. --ANK
1245                          */
1246                         err = -EINVAL;
1247                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1248                                 goto out;
1249
1250                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1251
1252                         err = -EHOSTUNREACH;
1253                         if (grt == NULL)
1254                                 goto out;
1255                         if (dev) {
1256                                 if (dev != grt->rt6i_dev) {
1257                                         dst_release(&grt->dst);
1258                                         goto out;
1259                                 }
1260                         } else {
1261                                 dev = grt->rt6i_dev;
1262                                 idev = grt->rt6i_idev;
1263                                 dev_hold(dev);
1264                                 in6_dev_hold(grt->rt6i_idev);
1265                         }
1266                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1267                                 err = 0;
1268                         dst_release(&grt->dst);
1269
1270                         if (err)
1271                                 goto out;
1272                 }
1273                 err = -EINVAL;
1274                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1275                         goto out;
1276         }
1277
1278         err = -ENODEV;
1279         if (dev == NULL)
1280                 goto out;
1281
1282         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1283                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1284                 if (IS_ERR(rt->rt6i_nexthop)) {
1285                         err = PTR_ERR(rt->rt6i_nexthop);
1286                         rt->rt6i_nexthop = NULL;
1287                         goto out;
1288                 }
1289         }
1290
1291         rt->rt6i_flags = cfg->fc_flags;
1292
1293 install_route:
1294         if (cfg->fc_mx) {
1295                 struct nlattr *nla;
1296                 int remaining;
1297
1298                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1299                         int type = nla_type(nla);
1300
1301                         if (type) {
1302                                 if (type > RTAX_MAX) {
1303                                         err = -EINVAL;
1304                                         goto out;
1305                                 }
1306
1307                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1308                         }
1309                 }
1310         }
1311
1312         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1313                 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1314         if (!dst_mtu(&rt->dst))
1315                 dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(dev));
1316         if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1317                 dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, dst_mtu(&rt->dst)));
1318         rt->dst.dev = dev;
1319         rt->rt6i_idev = idev;
1320         rt->rt6i_table = table;
1321
1322         cfg->fc_nlinfo.nl_net = dev_net(dev);
1323
1324         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1325
1326 out:
1327         if (dev)
1328                 dev_put(dev);
1329         if (idev)
1330                 in6_dev_put(idev);
1331         if (rt)
1332                 dst_free(&rt->dst);
1333         return err;
1334 }
1335
1336 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1337 {
1338         int err;
1339         struct fib6_table *table;
1340         struct net *net = dev_net(rt->rt6i_dev);
1341
1342         if (rt == net->ipv6.ip6_null_entry)
1343                 return -ENOENT;
1344
1345         table = rt->rt6i_table;
1346         write_lock_bh(&table->tb6_lock);
1347
1348         err = fib6_del(rt, info);
1349         dst_release(&rt->dst);
1350
1351         write_unlock_bh(&table->tb6_lock);
1352
1353         return err;
1354 }
1355
1356 int ip6_del_rt(struct rt6_info *rt)
1357 {
1358         struct nl_info info = {
1359                 .nl_net = dev_net(rt->rt6i_dev),
1360         };
1361         return __ip6_del_rt(rt, &info);
1362 }
1363
1364 static int ip6_route_del(struct fib6_config *cfg)
1365 {
1366         struct fib6_table *table;
1367         struct fib6_node *fn;
1368         struct rt6_info *rt;
1369         int err = -ESRCH;
1370
1371         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1372         if (table == NULL)
1373                 return err;
1374
1375         read_lock_bh(&table->tb6_lock);
1376
1377         fn = fib6_locate(&table->tb6_root,
1378                          &cfg->fc_dst, cfg->fc_dst_len,
1379                          &cfg->fc_src, cfg->fc_src_len);
1380
1381         if (fn) {
1382                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1383                         if (cfg->fc_ifindex &&
1384                             (rt->rt6i_dev == NULL ||
1385                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1386                                 continue;
1387                         if (cfg->fc_flags & RTF_GATEWAY &&
1388                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1389                                 continue;
1390                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1391                                 continue;
1392                         dst_hold(&rt->dst);
1393                         read_unlock_bh(&table->tb6_lock);
1394
1395                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1396                 }
1397         }
1398         read_unlock_bh(&table->tb6_lock);
1399
1400         return err;
1401 }
1402
1403 /*
1404  *      Handle redirects
1405  */
1406 struct ip6rd_flowi {
1407         struct flowi fl;
1408         struct in6_addr gateway;
1409 };
1410
1411 static struct rt6_info *__ip6_route_redirect(struct net *net,
1412                                              struct fib6_table *table,
1413                                              struct flowi *fl,
1414                                              int flags)
1415 {
1416         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1417         struct rt6_info *rt;
1418         struct fib6_node *fn;
1419
1420         /*
1421          * Get the "current" route for this destination and
1422          * check if the redirect has come from approriate router.
1423          *
1424          * RFC 2461 specifies that redirects should only be
1425          * accepted if they come from the nexthop to the target.
1426          * Due to the way the routes are chosen, this notion
1427          * is a bit fuzzy and one might need to check all possible
1428          * routes.
1429          */
1430
1431         read_lock_bh(&table->tb6_lock);
1432         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1433 restart:
1434         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1435                 /*
1436                  * Current route is on-link; redirect is always invalid.
1437                  *
1438                  * Seems, previous statement is not true. It could
1439                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1440                  * But then router serving it might decide, that we should
1441                  * know truth 8)8) --ANK (980726).
1442                  */
1443                 if (rt6_check_expired(rt))
1444                         continue;
1445                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1446                         continue;
1447                 if (fl->oif != rt->rt6i_dev->ifindex)
1448                         continue;
1449                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1450                         continue;
1451                 break;
1452         }
1453
1454         if (!rt)
1455                 rt = net->ipv6.ip6_null_entry;
1456         BACKTRACK(net, &fl->fl6_src);
1457 out:
1458         dst_hold(&rt->dst);
1459
1460         read_unlock_bh(&table->tb6_lock);
1461
1462         return rt;
1463 };
1464
1465 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1466                                            struct in6_addr *src,
1467                                            struct in6_addr *gateway,
1468                                            struct net_device *dev)
1469 {
1470         int flags = RT6_LOOKUP_F_HAS_SADDR;
1471         struct net *net = dev_net(dev);
1472         struct ip6rd_flowi rdfl = {
1473                 .fl = {
1474                         .oif = dev->ifindex,
1475                         .fl6_dst = *dest,
1476                         .fl6_src = *src,
1477                 },
1478         };
1479
1480         ipv6_addr_copy(&rdfl.gateway, gateway);
1481
1482         if (rt6_need_strict(dest))
1483                 flags |= RT6_LOOKUP_F_IFACE;
1484
1485         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1486                                                    flags, __ip6_route_redirect);
1487 }
1488
1489 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1490                   struct in6_addr *saddr,
1491                   struct neighbour *neigh, u8 *lladdr, int on_link)
1492 {
1493         struct rt6_info *rt, *nrt = NULL;
1494         struct netevent_redirect netevent;
1495         struct net *net = dev_net(neigh->dev);
1496
1497         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1498
1499         if (rt == net->ipv6.ip6_null_entry) {
1500                 if (net_ratelimit())
1501                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1502                                "for redirect target\n");
1503                 goto out;
1504         }
1505
1506         /*
1507          *      We have finally decided to accept it.
1508          */
1509
1510         neigh_update(neigh, lladdr, NUD_STALE,
1511                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1512                      NEIGH_UPDATE_F_OVERRIDE|
1513                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1514                                      NEIGH_UPDATE_F_ISROUTER))
1515                      );
1516
1517         /*
1518          * Redirect received -> path was valid.
1519          * Look, redirects are sent only in response to data packets,
1520          * so that this nexthop apparently is reachable. --ANK
1521          */
1522         dst_confirm(&rt->dst);
1523
1524         /* Duplicate redirect: silently ignore. */
1525         if (neigh == rt->dst.neighbour)
1526                 goto out;
1527
1528         nrt = ip6_rt_copy(rt);
1529         if (nrt == NULL)
1530                 goto out;
1531
1532         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1533         if (on_link)
1534                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1535
1536         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1537         nrt->rt6i_dst.plen = 128;
1538         nrt->dst.flags |= DST_HOST;
1539
1540         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1541         nrt->rt6i_nexthop = neigh_clone(neigh);
1542         /* Reset pmtu, it may be better */
1543         dst_metric_set(&nrt->dst, RTAX_MTU, ipv6_get_mtu(neigh->dev));
1544         dst_metric_set(&nrt->dst, RTAX_ADVMSS, ipv6_advmss(dev_net(neigh->dev),
1545                                                            dst_mtu(&nrt->dst)));
1546
1547         if (ip6_ins_rt(nrt))
1548                 goto out;
1549
1550         netevent.old = &rt->dst;
1551         netevent.new = &nrt->dst;
1552         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1553
1554         if (rt->rt6i_flags&RTF_CACHE) {
1555                 ip6_del_rt(rt);
1556                 return;
1557         }
1558
1559 out:
1560         dst_release(&rt->dst);
1561 }
1562
1563 /*
1564  *      Handle ICMP "packet too big" messages
1565  *      i.e. Path MTU discovery
1566  */
1567
1568 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1569                              struct net *net, u32 pmtu, int ifindex)
1570 {
1571         struct rt6_info *rt, *nrt;
1572         int allfrag = 0;
1573
1574         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1575         if (rt == NULL)
1576                 return;
1577
1578         if (pmtu >= dst_mtu(&rt->dst))
1579                 goto out;
1580
1581         if (pmtu < IPV6_MIN_MTU) {
1582                 /*
1583                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1584                  * MTU (1280) and a fragment header should always be included
1585                  * after a node receiving Too Big message reporting PMTU is
1586                  * less than the IPv6 Minimum Link MTU.
1587                  */
1588                 pmtu = IPV6_MIN_MTU;
1589                 allfrag = 1;
1590         }
1591
1592         /* New mtu received -> path was valid.
1593            They are sent only in response to data packets,
1594            so that this nexthop apparently is reachable. --ANK
1595          */
1596         dst_confirm(&rt->dst);
1597
1598         /* Host route. If it is static, it would be better
1599            not to override it, but add new one, so that
1600            when cache entry will expire old pmtu
1601            would return automatically.
1602          */
1603         if (rt->rt6i_flags & RTF_CACHE) {
1604                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1605                 if (allfrag) {
1606                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1607                         features |= RTAX_FEATURE_ALLFRAG;
1608                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1609                 }
1610                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1611                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1612                 goto out;
1613         }
1614
1615         /* Network route.
1616            Two cases are possible:
1617            1. It is connected route. Action: COW
1618            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1619          */
1620         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1621                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1622         else
1623                 nrt = rt6_alloc_clone(rt, daddr);
1624
1625         if (nrt) {
1626                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1627                 if (allfrag) {
1628                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1629                         features |= RTAX_FEATURE_ALLFRAG;
1630                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1631                 }
1632
1633                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1634                  * happened within 5 mins, the recommended timer is 10 mins.
1635                  * Here this route expiration time is set to ip6_rt_mtu_expires
1636                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1637                  * and detecting PMTU increase will be automatically happened.
1638                  */
1639                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1640                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1641
1642                 ip6_ins_rt(nrt);
1643         }
1644 out:
1645         dst_release(&rt->dst);
1646 }
1647
1648 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1649                         struct net_device *dev, u32 pmtu)
1650 {
1651         struct net *net = dev_net(dev);
1652
1653         /*
1654          * RFC 1981 states that a node "MUST reduce the size of the packets it
1655          * is sending along the path" that caused the Packet Too Big message.
1656          * Since it's not possible in the general case to determine which
1657          * interface was used to send the original packet, we update the MTU
1658          * on the interface that will be used to send future packets. We also
1659          * update the MTU on the interface that received the Packet Too Big in
1660          * case the original packet was forced out that interface with
1661          * SO_BINDTODEVICE or similar. This is the next best thing to the
1662          * correct behaviour, which would be to update the MTU on all
1663          * interfaces.
1664          */
1665         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1666         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1667 }
1668
1669 /*
1670  *      Misc support functions
1671  */
1672
1673 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1674 {
1675         struct net *net = dev_net(ort->rt6i_dev);
1676         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1677
1678         if (rt) {
1679                 rt->dst.input = ort->dst.input;
1680                 rt->dst.output = ort->dst.output;
1681
1682                 dst_copy_metrics(&rt->dst, &ort->dst);
1683                 rt->dst.error = ort->dst.error;
1684                 rt->dst.dev = ort->dst.dev;
1685                 if (rt->dst.dev)
1686                         dev_hold(rt->dst.dev);
1687                 rt->rt6i_idev = ort->rt6i_idev;
1688                 if (rt->rt6i_idev)
1689                         in6_dev_hold(rt->rt6i_idev);
1690                 rt->dst.lastuse = jiffies;
1691                 rt->rt6i_expires = 0;
1692
1693                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1694                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1695                 rt->rt6i_metric = 0;
1696
1697                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1698 #ifdef CONFIG_IPV6_SUBTREES
1699                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1700 #endif
1701                 rt->rt6i_table = ort->rt6i_table;
1702         }
1703         return rt;
1704 }
1705
1706 #ifdef CONFIG_IPV6_ROUTE_INFO
1707 static struct rt6_info *rt6_get_route_info(struct net *net,
1708                                            struct in6_addr *prefix, int prefixlen,
1709                                            struct in6_addr *gwaddr, int ifindex)
1710 {
1711         struct fib6_node *fn;
1712         struct rt6_info *rt = NULL;
1713         struct fib6_table *table;
1714
1715         table = fib6_get_table(net, RT6_TABLE_INFO);
1716         if (table == NULL)
1717                 return NULL;
1718
1719         write_lock_bh(&table->tb6_lock);
1720         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1721         if (!fn)
1722                 goto out;
1723
1724         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1725                 if (rt->rt6i_dev->ifindex != ifindex)
1726                         continue;
1727                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1728                         continue;
1729                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1730                         continue;
1731                 dst_hold(&rt->dst);
1732                 break;
1733         }
1734 out:
1735         write_unlock_bh(&table->tb6_lock);
1736         return rt;
1737 }
1738
1739 static struct rt6_info *rt6_add_route_info(struct net *net,
1740                                            struct in6_addr *prefix, int prefixlen,
1741                                            struct in6_addr *gwaddr, int ifindex,
1742                                            unsigned pref)
1743 {
1744         struct fib6_config cfg = {
1745                 .fc_table       = RT6_TABLE_INFO,
1746                 .fc_metric      = IP6_RT_PRIO_USER,
1747                 .fc_ifindex     = ifindex,
1748                 .fc_dst_len     = prefixlen,
1749                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1750                                   RTF_UP | RTF_PREF(pref),
1751                 .fc_nlinfo.pid = 0,
1752                 .fc_nlinfo.nlh = NULL,
1753                 .fc_nlinfo.nl_net = net,
1754         };
1755
1756         ipv6_addr_copy(&cfg.fc_dst, prefix);
1757         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1758
1759         /* We should treat it as a default route if prefix length is 0. */
1760         if (!prefixlen)
1761                 cfg.fc_flags |= RTF_DEFAULT;
1762
1763         ip6_route_add(&cfg);
1764
1765         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1766 }
1767 #endif
1768
1769 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1770 {
1771         struct rt6_info *rt;
1772         struct fib6_table *table;
1773
1774         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1775         if (table == NULL)
1776                 return NULL;
1777
1778         write_lock_bh(&table->tb6_lock);
1779         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1780                 if (dev == rt->rt6i_dev &&
1781                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1782                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1783                         break;
1784         }
1785         if (rt)
1786                 dst_hold(&rt->dst);
1787         write_unlock_bh(&table->tb6_lock);
1788         return rt;
1789 }
1790
1791 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1792                                      struct net_device *dev,
1793                                      unsigned int pref)
1794 {
1795         struct fib6_config cfg = {
1796                 .fc_table       = RT6_TABLE_DFLT,
1797                 .fc_metric      = IP6_RT_PRIO_USER,
1798                 .fc_ifindex     = dev->ifindex,
1799                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1800                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1801                 .fc_nlinfo.pid = 0,
1802                 .fc_nlinfo.nlh = NULL,
1803                 .fc_nlinfo.nl_net = dev_net(dev),
1804         };
1805
1806         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1807
1808         ip6_route_add(&cfg);
1809
1810         return rt6_get_dflt_router(gwaddr, dev);
1811 }
1812
1813 void rt6_purge_dflt_routers(struct net *net)
1814 {
1815         struct rt6_info *rt;
1816         struct fib6_table *table;
1817
1818         /* NOTE: Keep consistent with rt6_get_dflt_router */
1819         table = fib6_get_table(net, RT6_TABLE_DFLT);
1820         if (table == NULL)
1821                 return;
1822
1823 restart:
1824         read_lock_bh(&table->tb6_lock);
1825         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1826                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1827                         dst_hold(&rt->dst);
1828                         read_unlock_bh(&table->tb6_lock);
1829                         ip6_del_rt(rt);
1830                         goto restart;
1831                 }
1832         }
1833         read_unlock_bh(&table->tb6_lock);
1834 }
1835
1836 static void rtmsg_to_fib6_config(struct net *net,
1837                                  struct in6_rtmsg *rtmsg,
1838                                  struct fib6_config *cfg)
1839 {
1840         memset(cfg, 0, sizeof(*cfg));
1841
1842         cfg->fc_table = RT6_TABLE_MAIN;
1843         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1844         cfg->fc_metric = rtmsg->rtmsg_metric;
1845         cfg->fc_expires = rtmsg->rtmsg_info;
1846         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1847         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1848         cfg->fc_flags = rtmsg->rtmsg_flags;
1849
1850         cfg->fc_nlinfo.nl_net = net;
1851
1852         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1853         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1854         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1855 }
1856
1857 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1858 {
1859         struct fib6_config cfg;
1860         struct in6_rtmsg rtmsg;
1861         int err;
1862
1863         switch(cmd) {
1864         case SIOCADDRT:         /* Add a route */
1865         case SIOCDELRT:         /* Delete a route */
1866                 if (!capable(CAP_NET_ADMIN))
1867                         return -EPERM;
1868                 err = copy_from_user(&rtmsg, arg,
1869                                      sizeof(struct in6_rtmsg));
1870                 if (err)
1871                         return -EFAULT;
1872
1873                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1874
1875                 rtnl_lock();
1876                 switch (cmd) {
1877                 case SIOCADDRT:
1878                         err = ip6_route_add(&cfg);
1879                         break;
1880                 case SIOCDELRT:
1881                         err = ip6_route_del(&cfg);
1882                         break;
1883                 default:
1884                         err = -EINVAL;
1885                 }
1886                 rtnl_unlock();
1887
1888                 return err;
1889         }
1890
1891         return -EINVAL;
1892 }
1893
1894 /*
1895  *      Drop the packet on the floor
1896  */
1897
1898 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1899 {
1900         int type;
1901         struct dst_entry *dst = skb_dst(skb);
1902         switch (ipstats_mib_noroutes) {
1903         case IPSTATS_MIB_INNOROUTES:
1904                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1905                 if (type == IPV6_ADDR_ANY) {
1906                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1907                                       IPSTATS_MIB_INADDRERRORS);
1908                         break;
1909                 }
1910                 /* FALLTHROUGH */
1911         case IPSTATS_MIB_OUTNOROUTES:
1912                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1913                               ipstats_mib_noroutes);
1914                 break;
1915         }
1916         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1917         kfree_skb(skb);
1918         return 0;
1919 }
1920
1921 static int ip6_pkt_discard(struct sk_buff *skb)
1922 {
1923         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1924 }
1925
1926 static int ip6_pkt_discard_out(struct sk_buff *skb)
1927 {
1928         skb->dev = skb_dst(skb)->dev;
1929         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1930 }
1931
1932 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1933
1934 static int ip6_pkt_prohibit(struct sk_buff *skb)
1935 {
1936         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1937 }
1938
1939 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1940 {
1941         skb->dev = skb_dst(skb)->dev;
1942         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1943 }
1944
1945 #endif
1946
1947 /*
1948  *      Allocate a dst for local (unicast / anycast) address.
1949  */
1950
1951 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1952                                     const struct in6_addr *addr,
1953                                     int anycast)
1954 {
1955         struct net *net = dev_net(idev->dev);
1956         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1957         struct neighbour *neigh;
1958
1959         if (rt == NULL) {
1960                 if (net_ratelimit())
1961                         pr_warning("IPv6:  Maximum number of routes reached,"
1962                                    " consider increasing route/max_size.\n");
1963                 return ERR_PTR(-ENOMEM);
1964         }
1965
1966         dev_hold(net->loopback_dev);
1967         in6_dev_hold(idev);
1968
1969         rt->dst.flags = DST_HOST;
1970         rt->dst.input = ip6_input;
1971         rt->dst.output = ip6_output;
1972         rt->rt6i_dev = net->loopback_dev;
1973         rt->rt6i_idev = idev;
1974         dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(rt->rt6i_dev));
1975         dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, dst_mtu(&rt->dst)));
1976         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1977         rt->dst.obsolete = -1;
1978
1979         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1980         if (anycast)
1981                 rt->rt6i_flags |= RTF_ANYCAST;
1982         else
1983                 rt->rt6i_flags |= RTF_LOCAL;
1984         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1985         if (IS_ERR(neigh)) {
1986                 dst_free(&rt->dst);
1987
1988                 /* We are casting this because that is the return
1989                  * value type.  But an errno encoded pointer is the
1990                  * same regardless of the underlying pointer type,
1991                  * and that's what we are returning.  So this is OK.
1992                  */
1993                 return (struct rt6_info *) neigh;
1994         }
1995         rt->rt6i_nexthop = neigh;
1996
1997         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1998         rt->rt6i_dst.plen = 128;
1999         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2000
2001         atomic_set(&rt->dst.__refcnt, 1);
2002
2003         return rt;
2004 }
2005
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches every device */
	struct net *net;	/* namespace whose ip6_null_entry is spared */
};
2010
2011 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2012 {
2013         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
2014         struct net *net = ((struct arg_dev_net *)arg)->net;
2015
2016         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2017             rt != net->ipv6.ip6_null_entry) {
2018                 RT6_TRACE("deleted by ifdown %p\n", rt);
2019                 return -1;
2020         }
2021         return 0;
2022 }
2023
2024 void rt6_ifdown(struct net *net, struct net_device *dev)
2025 {
2026         struct arg_dev_net adn = {
2027                 .dev = dev,
2028                 .net = net,
2029         };
2030
2031         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2032         icmp6_clean_all(fib6_ifdown, &adn);
2033 }
2034
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
2040
2041 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2042 {
2043         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2044         struct inet6_dev *idev;
2045         struct net *net = dev_net(arg->dev);
2046
2047         /* In IPv6 pmtu discovery is not optional,
2048            so that RTAX_MTU lock cannot disable it.
2049            We still use this lock to block changes
2050            caused by addrconf/ndisc.
2051         */
2052
2053         idev = __in6_dev_get(arg->dev);
2054         if (idev == NULL)
2055                 return 0;
2056
2057         /* For administrative MTU increase, there is no way to discover
2058            IPv6 PMTU increase, so PMTU increase should be updated here.
2059            Since RFC 1981 doesn't include administrative MTU increase
2060            update PMTU increase is a MUST. (i.e. jumbo frame)
2061          */
2062         /*
2063            If new MTU is less than route PMTU, this new MTU will be the
2064            lowest MTU in the path, update the route PMTU to reflect PMTU
2065            decreases; if new MTU is greater than route PMTU, and the
2066            old MTU is the lowest MTU in the path, update the route PMTU
2067            to reflect the increase. In this case if the other nodes' MTU
2068            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2069            PMTU discouvery.
2070          */
2071         if (rt->rt6i_dev == arg->dev &&
2072             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2073             (dst_mtu(&rt->dst) >= arg->mtu ||
2074              (dst_mtu(&rt->dst) < arg->mtu &&
2075               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2076                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2077                 dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, arg->mtu));
2078         }
2079         return 0;
2080 }
2081
2082 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2083 {
2084         struct rt6_mtu_change_arg arg = {
2085                 .dev = dev,
2086                 .mtu = mtu,
2087         };
2088
2089         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2090 }
2091
2092 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2093         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2094         [RTA_OIF]               = { .type = NLA_U32 },
2095         [RTA_IIF]               = { .type = NLA_U32 },
2096         [RTA_PRIORITY]          = { .type = NLA_U32 },
2097         [RTA_METRICS]           = { .type = NLA_NESTED },
2098 };
2099
2100 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2101                               struct fib6_config *cfg)
2102 {
2103         struct rtmsg *rtm;
2104         struct nlattr *tb[RTA_MAX+1];
2105         int err;
2106
2107         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2108         if (err < 0)
2109                 goto errout;
2110
2111         err = -EINVAL;
2112         rtm = nlmsg_data(nlh);
2113         memset(cfg, 0, sizeof(*cfg));
2114
2115         cfg->fc_table = rtm->rtm_table;
2116         cfg->fc_dst_len = rtm->rtm_dst_len;
2117         cfg->fc_src_len = rtm->rtm_src_len;
2118         cfg->fc_flags = RTF_UP;
2119         cfg->fc_protocol = rtm->rtm_protocol;
2120
2121         if (rtm->rtm_type == RTN_UNREACHABLE)
2122                 cfg->fc_flags |= RTF_REJECT;
2123
2124         if (rtm->rtm_type == RTN_LOCAL)
2125                 cfg->fc_flags |= RTF_LOCAL;
2126
2127         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2128         cfg->fc_nlinfo.nlh = nlh;
2129         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2130
2131         if (tb[RTA_GATEWAY]) {
2132                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2133                 cfg->fc_flags |= RTF_GATEWAY;
2134         }
2135
2136         if (tb[RTA_DST]) {
2137                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2138
2139                 if (nla_len(tb[RTA_DST]) < plen)
2140                         goto errout;
2141
2142                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2143         }
2144
2145         if (tb[RTA_SRC]) {
2146                 int plen = (rtm->rtm_src_len + 7) >> 3;
2147
2148                 if (nla_len(tb[RTA_SRC]) < plen)
2149                         goto errout;
2150
2151                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2152         }
2153
2154         if (tb[RTA_OIF])
2155                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2156
2157         if (tb[RTA_PRIORITY])
2158                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2159
2160         if (tb[RTA_METRICS]) {
2161                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2162                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2163         }
2164
2165         if (tb[RTA_TABLE])
2166                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2167
2168         err = 0;
2169 errout:
2170         return err;
2171 }
2172
2173 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2174 {
2175         struct fib6_config cfg;
2176         int err;
2177
2178         err = rtm_to_fib6_config(skb, nlh, &cfg);
2179         if (err < 0)
2180                 return err;
2181
2182         return ip6_route_del(&cfg);
2183 }
2184
2185 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2186 {
2187         struct fib6_config cfg;
2188         int err;
2189
2190         err = rtm_to_fib6_config(skb, nlh, &cfg);
2191         if (err < 0)
2192                 return err;
2193
2194         return ip6_route_add(&cfg);
2195 }
2196
2197 static inline size_t rt6_nlmsg_size(void)
2198 {
2199         return NLMSG_ALIGN(sizeof(struct rtmsg))
2200                + nla_total_size(16) /* RTA_SRC */
2201                + nla_total_size(16) /* RTA_DST */
2202                + nla_total_size(16) /* RTA_GATEWAY */
2203                + nla_total_size(16) /* RTA_PREFSRC */
2204                + nla_total_size(4) /* RTA_TABLE */
2205                + nla_total_size(4) /* RTA_IIF */
2206                + nla_total_size(4) /* RTA_OIF */
2207                + nla_total_size(4) /* RTA_PRIORITY */
2208                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2209                + nla_total_size(sizeof(struct rta_cacheinfo));
2210 }
2211
2212 static int rt6_fill_node(struct net *net,
2213                          struct sk_buff *skb, struct rt6_info *rt,
2214                          struct in6_addr *dst, struct in6_addr *src,
2215                          int iif, int type, u32 pid, u32 seq,
2216                          int prefix, int nowait, unsigned int flags)
2217 {
2218         struct rtmsg *rtm;
2219         struct nlmsghdr *nlh;
2220         long expires;
2221         u32 table;
2222
2223         if (prefix) {   /* user wants prefix routes only */
2224                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2225                         /* success since this is not a prefix route */
2226                         return 1;
2227                 }
2228         }
2229
2230         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2231         if (nlh == NULL)
2232                 return -EMSGSIZE;
2233
2234         rtm = nlmsg_data(nlh);
2235         rtm->rtm_family = AF_INET6;
2236         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2237         rtm->rtm_src_len = rt->rt6i_src.plen;
2238         rtm->rtm_tos = 0;
2239         if (rt->rt6i_table)
2240                 table = rt->rt6i_table->tb6_id;
2241         else
2242                 table = RT6_TABLE_UNSPEC;
2243         rtm->rtm_table = table;
2244         NLA_PUT_U32(skb, RTA_TABLE, table);
2245         if (rt->rt6i_flags&RTF_REJECT)
2246                 rtm->rtm_type = RTN_UNREACHABLE;
2247         else if (rt->rt6i_flags&RTF_LOCAL)
2248                 rtm->rtm_type = RTN_LOCAL;
2249         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2250                 rtm->rtm_type = RTN_LOCAL;
2251         else
2252                 rtm->rtm_type = RTN_UNICAST;
2253         rtm->rtm_flags = 0;
2254         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2255         rtm->rtm_protocol = rt->rt6i_protocol;
2256         if (rt->rt6i_flags&RTF_DYNAMIC)
2257                 rtm->rtm_protocol = RTPROT_REDIRECT;
2258         else if (rt->rt6i_flags & RTF_ADDRCONF)
2259                 rtm->rtm_protocol = RTPROT_KERNEL;
2260         else if (rt->rt6i_flags&RTF_DEFAULT)
2261                 rtm->rtm_protocol = RTPROT_RA;
2262
2263         if (rt->rt6i_flags&RTF_CACHE)
2264                 rtm->rtm_flags |= RTM_F_CLONED;
2265
2266         if (dst) {
2267                 NLA_PUT(skb, RTA_DST, 16, dst);
2268                 rtm->rtm_dst_len = 128;
2269         } else if (rtm->rtm_dst_len)
2270                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2271 #ifdef CONFIG_IPV6_SUBTREES
2272         if (src) {
2273                 NLA_PUT(skb, RTA_SRC, 16, src);
2274                 rtm->rtm_src_len = 128;
2275         } else if (rtm->rtm_src_len)
2276                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2277 #endif
2278         if (iif) {
2279 #ifdef CONFIG_IPV6_MROUTE
2280                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2281                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2282                         if (err <= 0) {
2283                                 if (!nowait) {
2284                                         if (err == 0)
2285                                                 return 0;
2286                                         goto nla_put_failure;
2287                                 } else {
2288                                         if (err == -EMSGSIZE)
2289                                                 goto nla_put_failure;
2290                                 }
2291                         }
2292                 } else
2293 #endif
2294                         NLA_PUT_U32(skb, RTA_IIF, iif);
2295         } else if (dst) {
2296                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2297                 struct in6_addr saddr_buf;
2298                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2299                                        dst, 0, &saddr_buf) == 0)
2300                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2301         }
2302
2303         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2304                 goto nla_put_failure;
2305
2306         if (rt->dst.neighbour)
2307                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2308
2309         if (rt->dst.dev)
2310                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2311
2312         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2313
2314         if (!(rt->rt6i_flags & RTF_EXPIRES))
2315                 expires = 0;
2316         else if (rt->rt6i_expires - jiffies < INT_MAX)
2317                 expires = rt->rt6i_expires - jiffies;
2318         else
2319                 expires = INT_MAX;
2320
2321         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2322                                expires, rt->dst.error) < 0)
2323                 goto nla_put_failure;
2324
2325         return nlmsg_end(skb, nlh);
2326
2327 nla_put_failure:
2328         nlmsg_cancel(skb, nlh);
2329         return -EMSGSIZE;
2330 }
2331
2332 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2333 {
2334         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2335         int prefix;
2336
2337         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2338                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2339                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2340         } else
2341                 prefix = 0;
2342
2343         return rt6_fill_node(arg->net,
2344                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2345                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2346                      prefix, 0, NLM_F_MULTI);
2347 }
2348
2349 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2350 {
2351         struct net *net = sock_net(in_skb->sk);
2352         struct nlattr *tb[RTA_MAX+1];
2353         struct rt6_info *rt;
2354         struct sk_buff *skb;
2355         struct rtmsg *rtm;
2356         struct flowi fl;
2357         int err, iif = 0;
2358
2359         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2360         if (err < 0)
2361                 goto errout;
2362
2363         err = -EINVAL;
2364         memset(&fl, 0, sizeof(fl));
2365
2366         if (tb[RTA_SRC]) {
2367                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2368                         goto errout;
2369
2370                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2371         }
2372
2373         if (tb[RTA_DST]) {
2374                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2375                         goto errout;
2376
2377                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2378         }
2379
2380         if (tb[RTA_IIF])
2381                 iif = nla_get_u32(tb[RTA_IIF]);
2382
2383         if (tb[RTA_OIF])
2384                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2385
2386         if (iif) {
2387                 struct net_device *dev;
2388                 dev = __dev_get_by_index(net, iif);
2389                 if (!dev) {
2390                         err = -ENODEV;
2391                         goto errout;
2392                 }
2393         }
2394
2395         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2396         if (skb == NULL) {
2397                 err = -ENOBUFS;
2398                 goto errout;
2399         }
2400
2401         /* Reserve room for dummy headers, this skb can pass
2402            through good chunk of routing engine.
2403          */
2404         skb_reset_mac_header(skb);
2405         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2406
2407         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2408         skb_dst_set(skb, &rt->dst);
2409
2410         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2411                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2412                             nlh->nlmsg_seq, 0, 0, 0);
2413         if (err < 0) {
2414                 kfree_skb(skb);
2415                 goto errout;
2416         }
2417
2418         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2419 errout:
2420         return err;
2421 }
2422
2423 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2424 {
2425         struct sk_buff *skb;
2426         struct net *net = info->nl_net;
2427         u32 seq;
2428         int err;
2429
2430         err = -ENOBUFS;
2431         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2432
2433         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2434         if (skb == NULL)
2435                 goto errout;
2436
2437         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2438                                 event, info->pid, seq, 0, 0, 0);
2439         if (err < 0) {
2440                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2441                 WARN_ON(err == -EMSGSIZE);
2442                 kfree_skb(skb);
2443                 goto errout;
2444         }
2445         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2446                     info->nlh, gfp_any());
2447         return;
2448 errout:
2449         if (err < 0)
2450                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2451 }
2452
2453 static int ip6_route_dev_notify(struct notifier_block *this,
2454                                 unsigned long event, void *data)
2455 {
2456         struct net_device *dev = (struct net_device *)data;
2457         struct net *net = dev_net(dev);
2458
2459         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2460                 net->ipv6.ip6_null_entry->dst.dev = dev;
2461                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2462 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2463                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2464                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2465                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2466                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2467 #endif
2468         }
2469
2470         return NOTIFY_OK;
2471 }
2472
2473 /*
2474  *      /proc
2475  */
2476
2477 #ifdef CONFIG_PROC_FS
2478
/*
 * Buffer bookkeeping for /proc route output.
 * NOTE(review): nothing in the visible part of this file uses this
 * structure (rt6_info_route() works on a seq_file); confirm before
 * relying on it.
 */
struct rt6_proc_arg
{
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2487
2488 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2489 {
2490         struct seq_file *m = p_arg;
2491
2492         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2493
2494 #ifdef CONFIG_IPV6_SUBTREES
2495         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2496 #else
2497         seq_puts(m, "00000000000000000000000000000000 00 ");
2498 #endif
2499
2500         if (rt->rt6i_nexthop) {
2501                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2502         } else {
2503                 seq_puts(m, "00000000000000000000000000000000");
2504         }
2505         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2506                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2507                    rt->dst.__use, rt->rt6i_flags,
2508                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2509         return 0;
2510 }
2511
2512 static int ipv6_route_show(struct seq_file *m, void *v)
2513 {
2514         struct net *net = (struct net *)m->private;
2515         fib6_clean_all(net, rt6_info_route, 0, m);
2516         return 0;
2517 }
2518
2519 static int ipv6_route_open(struct inode *inode, struct file *file)
2520 {
2521         return single_open_net(inode, file, ipv6_route_show);
2522 }
2523
2524 static const struct file_operations ipv6_route_proc_fops = {
2525         .owner          = THIS_MODULE,
2526         .open           = ipv6_route_open,
2527         .read           = seq_read,
2528         .llseek         = seq_lseek,
2529         .release        = single_release_net,
2530 };
2531
2532 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2533 {
2534         struct net *net = (struct net *)seq->private;
2535         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2536                    net->ipv6.rt6_stats->fib_nodes,
2537                    net->ipv6.rt6_stats->fib_route_nodes,
2538                    net->ipv6.rt6_stats->fib_rt_alloc,
2539                    net->ipv6.rt6_stats->fib_rt_entries,
2540                    net->ipv6.rt6_stats->fib_rt_cache,
2541                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2542                    net->ipv6.rt6_stats->fib_discarded_routes);
2543
2544         return 0;
2545 }
2546
2547 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2548 {
2549         return single_open_net(inode, file, rt6_stats_seq_show);
2550 }
2551
2552 static const struct file_operations rt6_stats_seq_fops = {
2553         .owner   = THIS_MODULE,
2554         .open    = rt6_stats_seq_open,
2555         .read    = seq_read,
2556         .llseek  = seq_lseek,
2557         .release = single_release_net,
2558 };
2559 #endif  /* CONFIG_PROC_FS */
2560
2561 #ifdef CONFIG_SYSCTL
2562
2563 static
2564 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2565                               void __user *buffer, size_t *lenp, loff_t *ppos)
2566 {
2567         struct net *net = current->nsproxy->net_ns;
2568         int delay = net->ipv6.sysctl.flush_delay;
2569         if (write) {
2570                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2571                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2572                 return 0;
2573         } else
2574                 return -EINVAL;
2575 }
2576
2577 ctl_table ipv6_route_table_template[] = {
2578         {
2579                 .procname       =       "flush",
2580                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2581                 .maxlen         =       sizeof(int),
2582                 .mode           =       0200,
2583                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2584         },
2585         {
2586                 .procname       =       "gc_thresh",
2587                 .data           =       &ip6_dst_ops_template.gc_thresh,
2588                 .maxlen         =       sizeof(int),
2589                 .mode           =       0644,
2590                 .proc_handler   =       proc_dointvec,
2591         },
2592         {
2593                 .procname       =       "max_size",
2594                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2595                 .maxlen         =       sizeof(int),
2596                 .mode           =       0644,
2597                 .proc_handler   =       proc_dointvec,
2598         },
2599         {
2600                 .procname       =       "gc_min_interval",
2601                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2602                 .maxlen         =       sizeof(int),
2603                 .mode           =       0644,
2604                 .proc_handler   =       proc_dointvec_jiffies,
2605         },
2606         {
2607                 .procname       =       "gc_timeout",
2608                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2609                 .maxlen         =       sizeof(int),
2610                 .mode           =       0644,
2611                 .proc_handler   =       proc_dointvec_jiffies,
2612         },
2613         {
2614                 .procname       =       "gc_interval",
2615                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2616                 .maxlen         =       sizeof(int),
2617                 .mode           =       0644,
2618                 .proc_handler   =       proc_dointvec_jiffies,
2619         },
2620         {
2621                 .procname       =       "gc_elasticity",
2622                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2623                 .maxlen         =       sizeof(int),
2624                 .mode           =       0644,
2625                 .proc_handler   =       proc_dointvec,
2626         },
2627         {
2628                 .procname       =       "mtu_expires",
2629                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2630                 .maxlen         =       sizeof(int),
2631                 .mode           =       0644,
2632                 .proc_handler   =       proc_dointvec_jiffies,
2633         },
2634         {
2635                 .procname       =       "min_adv_mss",
2636                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2637                 .maxlen         =       sizeof(int),
2638                 .mode           =       0644,
2639                 .proc_handler   =       proc_dointvec,
2640         },
2641         {
2642                 .procname       =       "gc_min_interval_ms",
2643                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2644                 .maxlen         =       sizeof(int),
2645                 .mode           =       0644,
2646                 .proc_handler   =       proc_dointvec_ms_jiffies,
2647         },
2648         { }
2649 };
2650
2651 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2652 {
2653         struct ctl_table *table;
2654
2655         table = kmemdup(ipv6_route_table_template,
2656                         sizeof(ipv6_route_table_template),
2657                         GFP_KERNEL);
2658
2659         if (table) {
2660                 table[0].data = &net->ipv6.sysctl.flush_delay;
2661                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2662                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2663                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2664                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2665                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2666                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2667                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2668                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2669                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2670         }
2671
2672         return table;
2673 }
2674 #endif
2675
2676 static int __net_init ip6_route_net_init(struct net *net)
2677 {
2678         int ret = -ENOMEM;
2679
2680         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2681                sizeof(net->ipv6.ip6_dst_ops));
2682
2683         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2684                 goto out_ip6_dst_ops;
2685
2686         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2687                                            sizeof(*net->ipv6.ip6_null_entry),
2688                                            GFP_KERNEL);
2689         if (!net->ipv6.ip6_null_entry)
2690                 goto out_ip6_dst_entries;
2691         net->ipv6.ip6_null_entry->dst.path =
2692                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2693         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2694         dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2695
2696 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2697         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2698                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2699                                                GFP_KERNEL);
2700         if (!net->ipv6.ip6_prohibit_entry)
2701                 goto out_ip6_null_entry;
2702         net->ipv6.ip6_prohibit_entry->dst.path =
2703                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2704         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2705         dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2706
2707         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2708                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2709                                                GFP_KERNEL);
2710         if (!net->ipv6.ip6_blk_hole_entry)
2711                 goto out_ip6_prohibit_entry;
2712         net->ipv6.ip6_blk_hole_entry->dst.path =
2713                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2714         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2715         dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2716 #endif
2717
2718         net->ipv6.sysctl.flush_delay = 0;
2719         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2720         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2721         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2722         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2723         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2724         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2725         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2726
2727 #ifdef CONFIG_PROC_FS
2728         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2729         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2730 #endif
2731         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2732
2733         ret = 0;
2734 out:
2735         return ret;
2736
2737 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2738 out_ip6_prohibit_entry:
2739         kfree(net->ipv6.ip6_prohibit_entry);
2740 out_ip6_null_entry:
2741         kfree(net->ipv6.ip6_null_entry);
2742 #endif
2743 out_ip6_dst_entries:
2744         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2745 out_ip6_dst_ops:
2746         goto out;
2747 }
2748
2749 static void __net_exit ip6_route_net_exit(struct net *net)
2750 {
2751 #ifdef CONFIG_PROC_FS
2752         proc_net_remove(net, "ipv6_route");
2753         proc_net_remove(net, "rt6_stats");
2754 #endif
2755         kfree(net->ipv6.ip6_null_entry);
2756 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2757         kfree(net->ipv6.ip6_prohibit_entry);
2758         kfree(net->ipv6.ip6_blk_hole_entry);
2759 #endif
2760         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2761 }
2762
2763 static struct pernet_operations ip6_route_net_ops = {
2764         .init = ip6_route_net_init,
2765         .exit = ip6_route_net_exit,
2766 };
2767
2768 static struct notifier_block ip6_route_dev_notifier = {
2769         .notifier_call = ip6_route_dev_notify,
2770         .priority = 0,
2771 };
2772
2773 int __init ip6_route_init(void)
2774 {
2775         int ret;
2776
2777         ret = -ENOMEM;
2778         ip6_dst_ops_template.kmem_cachep =
2779                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2780                                   SLAB_HWCACHE_ALIGN, NULL);
2781         if (!ip6_dst_ops_template.kmem_cachep)
2782                 goto out;
2783
2784         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2785         if (ret)
2786                 goto out_kmem_cache;
2787
2788         ret = register_pernet_subsys(&ip6_route_net_ops);
2789         if (ret)
2790                 goto out_dst_entries;
2791
2792         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2793
2794         /* Registering of the loopback is done before this portion of code,
2795          * the loopback reference in rt6_info will not be taken, do it
2796          * manually for init_net */
2797         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2798         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2799   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2800         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2801         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2802         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2803         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2804   #endif
2805         ret = fib6_init();
2806         if (ret)
2807                 goto out_register_subsys;
2808
2809         ret = xfrm6_init();
2810         if (ret)
2811                 goto out_fib6_init;
2812
2813         ret = fib6_rules_init();
2814         if (ret)
2815                 goto xfrm6_init;
2816
2817         ret = -ENOBUFS;
2818         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2819             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2820             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2821                 goto fib6_rules_init;
2822
2823         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2824         if (ret)
2825                 goto fib6_rules_init;
2826
2827 out:
2828         return ret;
2829
2830 fib6_rules_init:
2831         fib6_rules_cleanup();
2832 xfrm6_init:
2833         xfrm6_fini();
2834 out_fib6_init:
2835         fib6_gc_cleanup();
2836 out_register_subsys:
2837         unregister_pernet_subsys(&ip6_route_net_ops);
2838 out_dst_entries:
2839         dst_entries_destroy(&ip6_dst_blackhole_ops);
2840 out_kmem_cache:
2841         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2842         goto out;
2843 }
2844
2845 void ip6_route_cleanup(void)
2846 {
2847         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2848         fib6_rules_cleanup();
2849         xfrm6_fini();
2850         fib6_gc_cleanup();
2851         unregister_pernet_subsys(&ip6_route_net_ops);
2852         dst_entries_destroy(&ip6_dst_blackhole_ops);
2853         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2854 }