]> git.karo-electronics.de Git - mv-sheeva.git/blob - net/ipv6/route.c
d9cb832be52919960b0d11e902372fd1f286ffc8
[mv-sheeva.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/* Debug verbosity: a value of 3 or more compiles the trace printks in. */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, ip6_pol_route() clones routes that already carry a
 * nexthop via rt6_alloc_clone(); when zero it returns them unchanged.
 */
#define CLONE_OFFLINK_ROUTE 0
76
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void             ip6_dst_destroy(struct dst_entry *);
82 static void             ip6_dst_ifdown(struct dst_entry *,
83                                        struct net_device *dev, int how);
84 static int               ip6_dst_gc(struct dst_ops *ops);
85
86 static int              ip6_pkt_discard(struct sk_buff *skb);
87 static int              ip6_pkt_discard_out(struct sk_buff *skb);
88 static void             ip6_link_failure(struct sk_buff *skb);
89 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90
91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net,
93                                            struct in6_addr *prefix, int prefixlen,
94                                            struct in6_addr *gwaddr, int ifindex,
95                                            unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net,
97                                            struct in6_addr *prefix, int prefixlen,
98                                            struct in6_addr *gwaddr, int ifindex);
99 #endif
100
101 static struct dst_ops ip6_dst_ops_template = {
102         .family                 =       AF_INET6,
103         .protocol               =       cpu_to_be16(ETH_P_IPV6),
104         .gc                     =       ip6_dst_gc,
105         .gc_thresh              =       1024,
106         .check                  =       ip6_dst_check,
107         .default_advmss         =       ip6_default_advmss,
108         .destroy                =       ip6_dst_destroy,
109         .ifdown                 =       ip6_dst_ifdown,
110         .negative_advice        =       ip6_negative_advice,
111         .link_failure           =       ip6_link_failure,
112         .update_pmtu            =       ip6_rt_update_pmtu,
113         .local_out              =       __ip6_local_out,
114 };
115
116 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
117 {
118 }
119
120 static struct dst_ops ip6_dst_blackhole_ops = {
121         .family                 =       AF_INET6,
122         .protocol               =       cpu_to_be16(ETH_P_IPV6),
123         .destroy                =       ip6_dst_destroy,
124         .check                  =       ip6_dst_check,
125         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
126 };
127
128 static struct rt6_info ip6_null_entry_template = {
129         .dst = {
130                 .__refcnt       = ATOMIC_INIT(1),
131                 .__use          = 1,
132                 .obsolete       = -1,
133                 .error          = -ENETUNREACH,
134                 .input          = ip6_pkt_discard,
135                 .output         = ip6_pkt_discard_out,
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
138         .rt6i_protocol  = RTPROT_KERNEL,
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/* Reject route reporting -EACCES, served by the ip6_pkt_prohibit*() handlers. */
static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use    = 1,
                .obsolete = -1,
                .error    = -EACCES,
                .input    = ip6_pkt_prohibit,
                .output   = ip6_pkt_prohibit_out,
        },
        .rt6i_flags    = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol = RTPROT_KERNEL,
        .rt6i_metric   = ~(u32) 0,
        .rt6i_ref      = ATOMIC_INIT(1),
};

/* Reject route reporting -EINVAL; both directions go to dst_discard(). */
static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use    = 1,
                .obsolete = -1,
                .error    = -EINVAL,
                .input    = dst_discard,
                .output   = dst_discard,
        },
        .rt6i_flags    = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol = RTPROT_KERNEL,
        .rt6i_metric   = ~(u32) 0,
        .rt6i_ref      = ATOMIC_INIT(1),
};

#endif
179
/* Allocate a dst from @ops and hand it back typed as an rt6_info. */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
        struct dst_entry *dst = dst_alloc(ops);

        return (struct rt6_info *)dst;
}
185
186 static void ip6_dst_destroy(struct dst_entry *dst)
187 {
188         struct rt6_info *rt = (struct rt6_info *)dst;
189         struct inet6_dev *idev = rt->rt6i_idev;
190         struct inet_peer *peer = rt->rt6i_peer;
191
192         if (idev != NULL) {
193                 rt->rt6i_idev = NULL;
194                 in6_dev_put(idev);
195         }
196         if (peer) {
197                 BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
198                 rt->rt6i_peer = NULL;
199                 inet_putpeer(peer);
200         }
201 }
202
203 void rt6_bind_peer(struct rt6_info *rt, int create)
204 {
205         struct inet_peer *peer;
206
207         if (WARN_ON(!(rt->rt6i_flags & RTF_CACHE)))
208                 return;
209
210         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
211         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
212                 inet_putpeer(peer);
213 }
214
215 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
216                            int how)
217 {
218         struct rt6_info *rt = (struct rt6_info *)dst;
219         struct inet6_dev *idev = rt->rt6i_idev;
220         struct net_device *loopback_dev =
221                 dev_net(dev)->loopback_dev;
222
223         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
224                 struct inet6_dev *loopback_idev =
225                         in6_dev_get(loopback_dev);
226                 if (loopback_idev != NULL) {
227                         rt->rt6i_idev = loopback_idev;
228                         in6_dev_put(idev);
229                 }
230         }
231 }
232
233 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
234 {
235         return (rt->rt6i_flags & RTF_EXPIRES) &&
236                 time_after(jiffies, rt->rt6i_expires);
237 }
238
239 static inline int rt6_need_strict(struct in6_addr *daddr)
240 {
241         return ipv6_addr_type(daddr) &
242                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
243 }
244
245 /*
246  *      Route lookup. Any table->tb6_lock is implied.
247  */
248
249 static inline struct rt6_info *rt6_device_match(struct net *net,
250                                                     struct rt6_info *rt,
251                                                     struct in6_addr *saddr,
252                                                     int oif,
253                                                     int flags)
254 {
255         struct rt6_info *local = NULL;
256         struct rt6_info *sprt;
257
258         if (!oif && ipv6_addr_any(saddr))
259                 goto out;
260
261         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
262                 struct net_device *dev = sprt->rt6i_dev;
263
264                 if (oif) {
265                         if (dev->ifindex == oif)
266                                 return sprt;
267                         if (dev->flags & IFF_LOOPBACK) {
268                                 if (sprt->rt6i_idev == NULL ||
269                                     sprt->rt6i_idev->dev->ifindex != oif) {
270                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
271                                                 continue;
272                                         if (local && (!oif ||
273                                                       local->rt6i_idev->dev->ifindex == oif))
274                                                 continue;
275                                 }
276                                 local = sprt;
277                         }
278                 } else {
279                         if (ipv6_chk_addr(net, saddr, dev,
280                                           flags & RT6_LOOKUP_F_IFACE))
281                                 return sprt;
282                 }
283         }
284
285         if (oif) {
286                 if (local)
287                         return local;
288
289                 if (flags & RT6_LOOKUP_F_IFACE)
290                         return net->ipv6.ip6_null_entry;
291         }
292 out:
293         return rt;
294 }
295
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards the
 * nexthop of @rt when its neighbour entry is not in a NUD_VALID state.
 *
 * Probes MUST be rate-limited (no more than one per minute); this is
 * enforced by comparing jiffies against neigh->updated plus the
 * interface's rtr_probe_interval.
 *
 * neigh->updated is modified here, so the check-and-update is done under
 * the write side of neigh->lock.  Holding only the read side would let
 * concurrent callers all pass the rate-limit test and each send a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        struct in6_addr mcaddr;
        struct in6_addr *target;

        /* Unlocked fast path: nothing to do for a missing or valid neighbour. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        write_lock_bh(&neigh->lock);
        if ((neigh->nud_state & NUD_VALID) ||
            !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                write_unlock_bh(&neigh->lock);
                return;
        }
        neigh->updated = jiffies;
        write_unlock_bh(&neigh->lock);

        /* Solicit the neighbour on its solicited-node multicast address. */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is performed. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
330
331 /*
332  * Default Router Selection (RFC 2461 6.3.6)
333  */
334 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
335 {
336         struct net_device *dev = rt->rt6i_dev;
337         if (!oif || dev->ifindex == oif)
338                 return 2;
339         if ((dev->flags & IFF_LOOPBACK) &&
340             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
341                 return 1;
342         return 0;
343 }
344
345 static inline int rt6_check_neigh(struct rt6_info *rt)
346 {
347         struct neighbour *neigh = rt->rt6i_nexthop;
348         int m;
349         if (rt->rt6i_flags & RTF_NONEXTHOP ||
350             !(rt->rt6i_flags & RTF_GATEWAY))
351                 m = 1;
352         else if (neigh) {
353                 read_lock_bh(&neigh->lock);
354                 if (neigh->nud_state & NUD_VALID)
355                         m = 2;
356 #ifdef CONFIG_IPV6_ROUTER_PREF
357                 else if (neigh->nud_state & NUD_FAILED)
358                         m = 0;
359 #endif
360                 else
361                         m = 1;
362                 read_unlock_bh(&neigh->lock);
363         } else
364                 m = 0;
365         return m;
366 }
367
368 static int rt6_score_route(struct rt6_info *rt, int oif,
369                            int strict)
370 {
371         int m, n;
372
373         m = rt6_check_dev(rt, oif);
374         if (!m && (strict & RT6_LOOKUP_F_IFACE))
375                 return -1;
376 #ifdef CONFIG_IPV6_ROUTER_PREF
377         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
378 #endif
379         n = rt6_check_neigh(rt);
380         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
381                 return -1;
382         return m;
383 }
384
385 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
386                                    int *mpri, struct rt6_info *match)
387 {
388         int m;
389
390         if (rt6_check_expired(rt))
391                 goto out;
392
393         m = rt6_score_route(rt, oif, strict);
394         if (m < 0)
395                 goto out;
396
397         if (m > *mpri) {
398                 if (strict & RT6_LOOKUP_F_REACHABLE)
399                         rt6_probe(match);
400                 *mpri = m;
401                 match = rt;
402         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
403                 rt6_probe(rt);
404         }
405
406 out:
407         return match;
408 }
409
410 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
411                                      struct rt6_info *rr_head,
412                                      u32 metric, int oif, int strict)
413 {
414         struct rt6_info *rt, *match;
415         int mpri = -1;
416
417         match = NULL;
418         for (rt = rr_head; rt && rt->rt6i_metric == metric;
419              rt = rt->dst.rt6_next)
420                 match = find_match(rt, oif, strict, &mpri, match);
421         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
422              rt = rt->dst.rt6_next)
423                 match = find_match(rt, oif, strict, &mpri, match);
424
425         return match;
426 }
427
428 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
429 {
430         struct rt6_info *match, *rt0;
431         struct net *net;
432
433         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
434                   __func__, fn->leaf, oif);
435
436         rt0 = fn->rr_ptr;
437         if (!rt0)
438                 fn->rr_ptr = rt0 = fn->leaf;
439
440         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
441
442         if (!match &&
443             (strict & RT6_LOOKUP_F_REACHABLE)) {
444                 struct rt6_info *next = rt0->dst.rt6_next;
445
446                 /* no entries matched; do round-robin */
447                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
448                         next = fn->leaf;
449
450                 if (next != rt0)
451                         fn->rr_ptr = next;
452         }
453
454         RT6_TRACE("%s() => %p\n",
455                   __func__, match);
456
457         net = dev_net(rt0->rt6i_dev);
458         return match ? match : net->ipv6.ip6_null_entry;
459 }
460
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option @opt of @len bytes received on @dev
 * from gateway @gwaddr: validate it, then add, refresh or (for a zero
 * lifetime) delete the corresponding route.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct net *net = dev_net(dev);
        struct in6_addr masked_prefix;
        struct in6_addr *prefix;
        unsigned long lifetime;
        unsigned int pref;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /*
         * Sanity check for prefix_len and length: a prefix longer than
         * 64 bits needs length >= 2, any non-empty prefix needs
         * length >= 1.
         */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length != 3) {
                /* this function is safe */
                ipv6_addr_prefix(&masked_prefix,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &masked_prefix;
        } else {
                prefix = (struct in6_addr *)rinfo->prefix;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* a zero lifetime withdraws an existing route */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (!rt)
                return 0;

        if (addrconf_finite_timeout(lifetime)) {
                rt->rt6i_expires = jiffies + HZ * lifetime;
                rt->rt6i_flags |= RTF_EXPIRES;
        } else {
                rt->rt6i_flags &= ~RTF_EXPIRES;
        }
        dst_release(&rt->dst);

        return 0;
}
#endif
534
/*
 * BACKTRACK - climb the fib6 tree after a lookup yielded the null entry.
 *
 * Relies on the expanding function providing:
 *   - locals `rt` (struct rt6_info *) and `fn` (struct fib6_node *);
 *   - a label `out`, jumped to when a node with RTN_TL_ROOT is reached;
 *   - a label `restart`, jumped to when a node with RTN_RTINFO is found.
 *
 * When the parent has a subtree other than the current node, that subtree
 * is searched with @saddr; otherwise the walk moves to the parent.
 * Macro arguments are parenthesised so any expression can be passed.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == (__net)->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while (0)
552
553 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
554                                              struct fib6_table *table,
555                                              struct flowi *fl, int flags)
556 {
557         struct fib6_node *fn;
558         struct rt6_info *rt;
559
560         read_lock_bh(&table->tb6_lock);
561         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
562 restart:
563         rt = fn->leaf;
564         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
565         BACKTRACK(net, &fl->fl6_src);
566 out:
567         dst_use(&rt->dst, jiffies);
568         read_unlock_bh(&table->tb6_lock);
569         return rt;
570
571 }
572
573 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
574                             const struct in6_addr *saddr, int oif, int strict)
575 {
576         struct flowi fl = {
577                 .oif = oif,
578                 .fl6_dst = *daddr,
579         };
580         struct dst_entry *dst;
581         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
582
583         if (saddr) {
584                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
585                 flags |= RT6_LOOKUP_F_HAS_SADDR;
586         }
587
588         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
589         if (dst->error == 0)
590                 return (struct rt6_info *) dst;
591
592         dst_release(dst);
593
594         return NULL;
595 }
596
597 EXPORT_SYMBOL(rt6_lookup);
598
599 /* ip6_ins_rt is called with FREE table->tb6_lock.
600    It takes new route entry, the addition fails by any reason the
601    route is freed. In any case, if caller does not hold it, it may
602    be destroyed.
603  */
604
605 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
606 {
607         int err;
608         struct fib6_table *table;
609
610         table = rt->rt6i_table;
611         write_lock_bh(&table->tb6_lock);
612         err = fib6_add(&table->tb6_root, rt, info);
613         write_unlock_bh(&table->tb6_lock);
614
615         return err;
616 }
617
618 int ip6_ins_rt(struct rt6_info *rt)
619 {
620         struct nl_info info = {
621                 .nl_net = dev_net(rt->rt6i_dev),
622         };
623         return __ip6_ins_rt(rt, &info);
624 }
625
626 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
627                                       struct in6_addr *saddr)
628 {
629         struct rt6_info *rt;
630
631         /*
632          *      Clone the route.
633          */
634
635         rt = ip6_rt_copy(ort);
636
637         if (rt) {
638                 struct neighbour *neigh;
639                 int attempts = !in_softirq();
640
641                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
642                         if (rt->rt6i_dst.plen != 128 &&
643                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
644                                 rt->rt6i_flags |= RTF_ANYCAST;
645                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
646                 }
647
648                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
649                 rt->rt6i_dst.plen = 128;
650                 rt->rt6i_flags |= RTF_CACHE;
651                 rt->dst.flags |= DST_HOST;
652
653 #ifdef CONFIG_IPV6_SUBTREES
654                 if (rt->rt6i_src.plen && saddr) {
655                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
656                         rt->rt6i_src.plen = 128;
657                 }
658 #endif
659
660         retry:
661                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
662                 if (IS_ERR(neigh)) {
663                         struct net *net = dev_net(rt->rt6i_dev);
664                         int saved_rt_min_interval =
665                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
666                         int saved_rt_elasticity =
667                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
668
669                         if (attempts-- > 0) {
670                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
671                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
672
673                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
674
675                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
676                                         saved_rt_elasticity;
677                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
678                                         saved_rt_min_interval;
679                                 goto retry;
680                         }
681
682                         if (net_ratelimit())
683                                 printk(KERN_WARNING
684                                        "ipv6: Neighbour table overflow.\n");
685                         dst_free(&rt->dst);
686                         return NULL;
687                 }
688                 rt->rt6i_nexthop = neigh;
689
690         }
691
692         return rt;
693 }
694
695 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
696 {
697         struct rt6_info *rt = ip6_rt_copy(ort);
698         if (rt) {
699                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
700                 rt->rt6i_dst.plen = 128;
701                 rt->rt6i_flags |= RTF_CACHE;
702                 rt->dst.flags |= DST_HOST;
703                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
704         }
705         return rt;
706 }
707
708 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
709                                       struct flowi *fl, int flags)
710 {
711         struct fib6_node *fn;
712         struct rt6_info *rt, *nrt;
713         int strict = 0;
714         int attempts = 3;
715         int err;
716         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
717
718         strict |= flags & RT6_LOOKUP_F_IFACE;
719
720 relookup:
721         read_lock_bh(&table->tb6_lock);
722
723 restart_2:
724         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
725
726 restart:
727         rt = rt6_select(fn, oif, strict | reachable);
728
729         BACKTRACK(net, &fl->fl6_src);
730         if (rt == net->ipv6.ip6_null_entry ||
731             rt->rt6i_flags & RTF_CACHE)
732                 goto out;
733
734         dst_hold(&rt->dst);
735         read_unlock_bh(&table->tb6_lock);
736
737         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
738                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
739         else {
740 #if CLONE_OFFLINK_ROUTE
741                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
742 #else
743                 goto out2;
744 #endif
745         }
746
747         dst_release(&rt->dst);
748         rt = nrt ? : net->ipv6.ip6_null_entry;
749
750         dst_hold(&rt->dst);
751         if (nrt) {
752                 err = ip6_ins_rt(nrt);
753                 if (!err)
754                         goto out2;
755         }
756
757         if (--attempts <= 0)
758                 goto out2;
759
760         /*
761          * Race condition! In the gap, when table->tb6_lock was
762          * released someone could insert this route.  Relookup.
763          */
764         dst_release(&rt->dst);
765         goto relookup;
766
767 out:
768         if (reachable) {
769                 reachable = 0;
770                 goto restart_2;
771         }
772         dst_hold(&rt->dst);
773         read_unlock_bh(&table->tb6_lock);
774 out2:
775         rt->dst.lastuse = jiffies;
776         rt->dst.__use++;
777
778         return rt;
779 }
780
781 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
782                                             struct flowi *fl, int flags)
783 {
784         return ip6_pol_route(net, table, fl->iif, fl, flags);
785 }
786
787 void ip6_route_input(struct sk_buff *skb)
788 {
789         struct ipv6hdr *iph = ipv6_hdr(skb);
790         struct net *net = dev_net(skb->dev);
791         int flags = RT6_LOOKUP_F_HAS_SADDR;
792         struct flowi fl = {
793                 .iif = skb->dev->ifindex,
794                 .fl6_dst = iph->daddr,
795                 .fl6_src = iph->saddr,
796                 .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
797                 .mark = skb->mark,
798                 .proto = iph->nexthdr,
799         };
800
801         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
802                 flags |= RT6_LOOKUP_F_IFACE;
803
804         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
805 }
806
807 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
808                                              struct flowi *fl, int flags)
809 {
810         return ip6_pol_route(net, table, fl->oif, fl, flags);
811 }
812
813 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
814                                     struct flowi *fl)
815 {
816         int flags = 0;
817
818         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
819                 flags |= RT6_LOOKUP_F_IFACE;
820
821         if (!ipv6_addr_any(&fl->fl6_src))
822                 flags |= RT6_LOOKUP_F_HAS_SADDR;
823         else if (sk)
824                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
825
826         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
827 }
828
829 EXPORT_SYMBOL(ip6_route_output);
830
831 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
832 {
833         struct rt6_info *ort = (struct rt6_info *) *dstp;
834         struct rt6_info *rt = (struct rt6_info *)
835                 dst_alloc(&ip6_dst_blackhole_ops);
836         struct dst_entry *new = NULL;
837
838         if (rt) {
839                 new = &rt->dst;
840
841                 atomic_set(&new->__refcnt, 1);
842                 new->__use = 1;
843                 new->input = dst_discard;
844                 new->output = dst_discard;
845
846                 dst_copy_metrics(new, &ort->dst);
847                 new->dev = ort->dst.dev;
848                 if (new->dev)
849                         dev_hold(new->dev);
850                 rt->rt6i_idev = ort->rt6i_idev;
851                 if (rt->rt6i_idev)
852                         in6_dev_hold(rt->rt6i_idev);
853                 rt->rt6i_expires = 0;
854
855                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
856                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
857                 rt->rt6i_metric = 0;
858
859                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
860 #ifdef CONFIG_IPV6_SUBTREES
861                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
862 #endif
863
864                 dst_free(new);
865         }
866
867         dst_release(*dstp);
868         *dstp = new;
869         return new ? 0 : -ENOMEM;
870 }
871 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
872
873 /*
874  *      Destination cache support functions
875  */
876
877 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
878 {
879         struct rt6_info *rt;
880
881         rt = (struct rt6_info *) dst;
882
883         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
884                 return dst;
885
886         return NULL;
887 }
888
889 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
890 {
891         struct rt6_info *rt = (struct rt6_info *) dst;
892
893         if (rt) {
894                 if (rt->rt6i_flags & RTF_CACHE) {
895                         if (rt6_check_expired(rt)) {
896                                 ip6_del_rt(rt);
897                                 dst = NULL;
898                         }
899                 } else {
900                         dst_release(dst);
901                         dst = NULL;
902                 }
903         }
904         return dst;
905 }
906
907 static void ip6_link_failure(struct sk_buff *skb)
908 {
909         struct rt6_info *rt;
910
911         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
912
913         rt = (struct rt6_info *) skb_dst(skb);
914         if (rt) {
915                 if (rt->rt6i_flags&RTF_CACHE) {
916                         dst_set_expires(&rt->dst, 0);
917                         rt->rt6i_flags |= RTF_EXPIRES;
918                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
919                         rt->rt6i_node->fn_sernum = -1;
920         }
921 }
922
923 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
924 {
925         struct rt6_info *rt6 = (struct rt6_info*)dst;
926
927         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
928                 rt6->rt6i_flags |= RTF_MODIFIED;
929                 if (mtu < IPV6_MIN_MTU) {
930                         u32 features = dst_metric(dst, RTAX_FEATURES);
931                         mtu = IPV6_MIN_MTU;
932                         features |= RTAX_FEATURE_ALLFRAG;
933                         dst_metric_set(dst, RTAX_FEATURES, features);
934                 }
935                 dst_metric_set(dst, RTAX_MTU, mtu);
936                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
937         }
938 }
939
/* Defined further down; declared here because icmp6_dst_alloc() uses it first. */
940 static int ipv6_get_mtu(struct net_device *dev);
941
942 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
943 {
944         struct net_device *dev = dst->dev;
945         unsigned int mtu = dst_mtu(dst);
946         struct net *net = dev_net(dev);
947
948         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
949
950         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
951                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
952
953         /*
954          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
955          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
956          * IPV6_MAXPLEN is also valid and means: "any MSS,
957          * rely only on pmtu discovery"
958          */
959         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
960                 mtu = IPV6_MAXPLEN;
961         return mtu;
962 }
963
964 static struct dst_entry *icmp6_dst_gc_list;
965 static DEFINE_SPINLOCK(icmp6_dst_lock);
966
967 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
968                                   struct neighbour *neigh,
969                                   const struct in6_addr *addr)
970 {
971         struct rt6_info *rt;
972         struct inet6_dev *idev = in6_dev_get(dev);
973         struct net *net = dev_net(dev);
974
975         if (unlikely(idev == NULL))
976                 return NULL;
977
978         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
979         if (unlikely(rt == NULL)) {
980                 in6_dev_put(idev);
981                 goto out;
982         }
983
984         dev_hold(dev);
985         if (neigh)
986                 neigh_hold(neigh);
987         else {
988                 neigh = ndisc_get_neigh(dev, addr);
989                 if (IS_ERR(neigh))
990                         neigh = NULL;
991         }
992
993         rt->rt6i_dev      = dev;
994         rt->rt6i_idev     = idev;
995         rt->rt6i_nexthop  = neigh;
996         atomic_set(&rt->dst.__refcnt, 1);
997         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
998         dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(rt->rt6i_dev));
999         rt->dst.output  = ip6_output;
1000
1001 #if 0   /* there's no chance to use these for ndisc */
1002         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1003                                 ? DST_HOST
1004                                 : 0;
1005         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1006         rt->rt6i_dst.plen = 128;
1007 #endif
1008
1009         spin_lock_bh(&icmp6_dst_lock);
1010         rt->dst.next = icmp6_dst_gc_list;
1011         icmp6_dst_gc_list = &rt->dst;
1012         spin_unlock_bh(&icmp6_dst_lock);
1013
1014         fib6_force_start_gc(net);
1015
1016 out:
1017         return &rt->dst;
1018 }
1019
1020 int icmp6_dst_gc(void)
1021 {
1022         struct dst_entry *dst, *next, **pprev;
1023         int more = 0;
1024
1025         next = NULL;
1026
1027         spin_lock_bh(&icmp6_dst_lock);
1028         pprev = &icmp6_dst_gc_list;
1029
1030         while ((dst = *pprev) != NULL) {
1031                 if (!atomic_read(&dst->__refcnt)) {
1032                         *pprev = dst->next;
1033                         dst_free(dst);
1034                 } else {
1035                         pprev = &dst->next;
1036                         ++more;
1037                 }
1038         }
1039
1040         spin_unlock_bh(&icmp6_dst_lock);
1041
1042         return more;
1043 }
1044
1045 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1046                             void *arg)
1047 {
1048         struct dst_entry *dst, **pprev;
1049
1050         spin_lock_bh(&icmp6_dst_lock);
1051         pprev = &icmp6_dst_gc_list;
1052         while ((dst = *pprev) != NULL) {
1053                 struct rt6_info *rt = (struct rt6_info *) dst;
1054                 if (func(rt, arg)) {
1055                         *pprev = dst->next;
1056                         dst_free(dst);
1057                 } else {
1058                         pprev = &dst->next;
1059                 }
1060         }
1061         spin_unlock_bh(&icmp6_dst_lock);
1062 }
1063
1064 static int ip6_dst_gc(struct dst_ops *ops)
1065 {
1066         unsigned long now = jiffies;
1067         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1068         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1069         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1070         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1071         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1072         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1073         int entries;
1074
1075         entries = dst_entries_get_fast(ops);
1076         if (time_after(rt_last_gc + rt_min_interval, now) &&
1077             entries <= rt_max_size)
1078                 goto out;
1079
1080         net->ipv6.ip6_rt_gc_expire++;
1081         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1082         net->ipv6.ip6_rt_last_gc = now;
1083         entries = dst_entries_get_slow(ops);
1084         if (entries < ops->gc_thresh)
1085                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1086 out:
1087         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1088         return entries > rt_max_size;
1089 }
1090
1091 /* Clean host part of a prefix. Not necessary in radix tree,
1092    but results in cleaner routing tables.
1093
1094    Remove it only when all the things will work!
1095  */
1096
1097 static int ipv6_get_mtu(struct net_device *dev)
1098 {
1099         int mtu = IPV6_MIN_MTU;
1100         struct inet6_dev *idev;
1101
1102         rcu_read_lock();
1103         idev = __in6_dev_get(dev);
1104         if (idev)
1105                 mtu = idev->cnf.mtu6;
1106         rcu_read_unlock();
1107         return mtu;
1108 }
1109
1110 int ip6_dst_hoplimit(struct dst_entry *dst)
1111 {
1112         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1113         if (hoplimit == 0) {
1114                 struct net_device *dev = dst->dev;
1115                 struct inet6_dev *idev;
1116
1117                 rcu_read_lock();
1118                 idev = __in6_dev_get(dev);
1119                 if (idev)
1120                         hoplimit = idev->cnf.hop_limit;
1121                 else
1122                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1123                 rcu_read_unlock();
1124         }
1125         return hoplimit;
1126 }
1127 EXPORT_SYMBOL(ip6_dst_hoplimit);
1128
1129 /*
1130  *
1131  */
1132
/*
 * Build a route from @cfg and insert it into the FIB.
 *
 * Steps, in order:
 *  - reject prefix lengths above 128 (and any source prefix when
 *    CONFIG_IPV6_SUBTREES is off) with -EINVAL;
 *  - if fc_ifindex is set, look up the device and its inet6_dev
 *    (-ENODEV if either is missing);
 *  - get or create the target table (-ENOBUFS) and allocate the route
 *    (-ENOMEM);
 *  - fill in expiry, protocol, input/output handlers, keys and metric;
 *  - turn RTF_REJECT requests, and non-local routes via a loopback
 *    device to a non-loopback destination, into reject routes bound to
 *    the namespace loopback device;
 *  - for RTF_GATEWAY routes validate the gateway and, if no device was
 *    given, take the device from the route towards the gateway;
 *  - look up the next-hop neighbour, apply the netlink metrics and
 *    insert the route with __ip6_ins_rt().
 *
 * Side effects on @cfg: fc_metric and fc_protocol are replaced by their
 * defaults when zero/unspecified, and fc_nlinfo.nl_net is overwritten
 * with the namespace of the chosen device.
 *
 * Returns the result of __ip6_ins_rt() on the insert path, or a negative
 * errno after releasing every reference taken here.
 */
1133 int ip6_route_add(struct fib6_config *cfg)
1134 {
1135         int err;
1136         struct net *net = cfg->fc_nlinfo.nl_net;
1137         struct rt6_info *rt = NULL;
1138         struct net_device *dev = NULL;
1139         struct inet6_dev *idev = NULL;
1140         struct fib6_table *table;
1141         int addr_type;
1142
1143         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1144                 return -EINVAL;
1145 #ifndef CONFIG_IPV6_SUBTREES
1146         if (cfg->fc_src_len)
1147                 return -EINVAL;
1148 #endif
        /* Both lookups take a reference; the out: path drops them. */
1149         if (cfg->fc_ifindex) {
1150                 err = -ENODEV;
1151                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1152                 if (!dev)
1153                         goto out;
1154                 idev = in6_dev_get(dev);
1155                 if (!idev)
1156                         goto out;
1157         }
1158
1159         if (cfg->fc_metric == 0)
1160                 cfg->fc_metric = IP6_RT_PRIO_USER;
1161
1162         table = fib6_new_table(net, cfg->fc_table);
1163         if (table == NULL) {
1164                 err = -ENOBUFS;
1165                 goto out;
1166         }
1167
1168         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1169
1170         if (rt == NULL) {
1171                 err = -ENOMEM;
1172                 goto out;
1173         }
1174
1175         rt->dst.obsolete = -1;
1176         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1177                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1178                                 0;
1179
1180         if (cfg->fc_protocol == RTPROT_UNSPEC)
1181                 cfg->fc_protocol = RTPROT_BOOT;
1182         rt->rt6i_protocol = cfg->fc_protocol;
1183
1184         addr_type = ipv6_addr_type(&cfg->fc_dst);
1185
        /* Input handler: multicast, local delivery, or forwarding. */
1186         if (addr_type & IPV6_ADDR_MULTICAST)
1187                 rt->dst.input = ip6_mc_input;
1188         else if (cfg->fc_flags & RTF_LOCAL)
1189                 rt->dst.input = ip6_input;
1190         else
1191                 rt->dst.input = ip6_forward;
1192
1193         rt->dst.output = ip6_output;
1194
1195         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1196         rt->rt6i_dst.plen = cfg->fc_dst_len;
1197         if (rt->rt6i_dst.plen == 128)
1198                rt->dst.flags = DST_HOST;
1199
1200 #ifdef CONFIG_IPV6_SUBTREES
1201         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1202         rt->rt6i_src.plen = cfg->fc_src_len;
1203 #endif
1204
1205         rt->rt6i_metric = cfg->fc_metric;
1206
1207         /* We cannot add true routes via loopback here,
1208            they would result in kernel looping; promote them to reject routes
1209          */
1210         if ((cfg->fc_flags & RTF_REJECT) ||
1211             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1212                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1213                 /* hold loopback dev/idev if we haven't done so. */
1214                 if (dev != net->loopback_dev) {
1215                         if (dev) {
1216                                 dev_put(dev);
1217                                 in6_dev_put(idev);
1218                         }
1219                         dev = net->loopback_dev;
1220                         dev_hold(dev);
1221                         idev = in6_dev_get(dev);
1222                         if (!idev) {
1223                                 err = -ENODEV;
1224                                 goto out;
1225                         }
1226                 }
1227                 rt->dst.output = ip6_pkt_discard_out;
1228                 rt->dst.input = ip6_pkt_discard;
1229                 rt->dst.error = -ENETUNREACH;
1230                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1231                 goto install_route;
1232         }
1233
1234         if (cfg->fc_flags & RTF_GATEWAY) {
1235                 struct in6_addr *gw_addr;
1236                 int gwa_type;
1237
1238                 gw_addr = &cfg->fc_gateway;
1239                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1240                 gwa_type = ipv6_addr_type(gw_addr);
1241
1242                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1243                         struct rt6_info *grt;
1244
1245                         /* IPv6 strictly inhibits using not link-local
1246                            addresses as nexthop address.
1247                            Otherwise, router will not able to send redirects.
1248                            It is very good, but in some (rare!) circumstances
1249                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1250                            some exceptions. --ANK
1251                          */
1252                         err = -EINVAL;
1253                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1254                                 goto out;
1255
1256                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1257
1258                         err = -EHOSTUNREACH;
1259                         if (grt == NULL)
1260                                 goto out;
1261                         if (dev) {
1262                                 if (dev != grt->rt6i_dev) {
1263                                         dst_release(&grt->dst);
1264                                         goto out;
1265                                 }
1266                         } else {
1267                                 dev = grt->rt6i_dev;
1268                                 idev = grt->rt6i_idev;
1269                                 dev_hold(dev);
1270                                 in6_dev_hold(grt->rt6i_idev);
1271                         }
                        /*
                         * The gateway is accepted only if the route towards
                         * it is not itself a gateway route; otherwise err
                         * stays -EHOSTUNREACH.
                         */
1272                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1273                                 err = 0;
1274                         dst_release(&grt->dst);
1275
1276                         if (err)
1277                                 goto out;
1278                 }
1279                 err = -EINVAL;
1280                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1281                         goto out;
1282         }
1283
1284         err = -ENODEV;
1285         if (dev == NULL)
1286                 goto out;
1287
1288         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1289                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1290                 if (IS_ERR(rt->rt6i_nexthop)) {
1291                         err = PTR_ERR(rt->rt6i_nexthop);
1292                         rt->rt6i_nexthop = NULL;
1293                         goto out;
1294                 }
1295         }
1296
1297         rt->rt6i_flags = cfg->fc_flags;
1298
1299 install_route:
        /*
         * Apply caller-supplied metrics: attribute type 0 is skipped and
         * any type above RTAX_MAX fails the whole request with -EINVAL.
         */
1300         if (cfg->fc_mx) {
1301                 struct nlattr *nla;
1302                 int remaining;
1303
1304                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1305                         int type = nla_type(nla);
1306
1307                         if (type) {
1308                                 if (type > RTAX_MAX) {
1309                                         err = -EINVAL;
1310                                         goto out;
1311                                 }
1312
1313                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1314                         }
1315                 }
1316         }
1317
        /* No MTU metric so far: fall back to the device's IPv6 MTU. */
1318         if (!dst_mtu(&rt->dst))
1319                 dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(dev));
        /*
         * The route now stores dev/idev; the references held above are
         * not dropped on this path.
         */
1320         rt->dst.dev = dev;
1321         rt->rt6i_idev = idev;
1322         rt->rt6i_table = table;
1323
1324         cfg->fc_nlinfo.nl_net = dev_net(dev);
1325
1326         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1327
        /* Error exit: drop the references taken here and free the route. */
1328 out:
1329         if (dev)
1330                 dev_put(dev);
1331         if (idev)
1332                 in6_dev_put(idev);
1333         if (rt)
1334                 dst_free(&rt->dst);
1335         return err;
1336 }
1337
1338 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1339 {
1340         int err;
1341         struct fib6_table *table;
1342         struct net *net = dev_net(rt->rt6i_dev);
1343
1344         if (rt == net->ipv6.ip6_null_entry)
1345                 return -ENOENT;
1346
1347         table = rt->rt6i_table;
1348         write_lock_bh(&table->tb6_lock);
1349
1350         err = fib6_del(rt, info);
1351         dst_release(&rt->dst);
1352
1353         write_unlock_bh(&table->tb6_lock);
1354
1355         return err;
1356 }
1357
1358 int ip6_del_rt(struct rt6_info *rt)
1359 {
1360         struct nl_info info = {
1361                 .nl_net = dev_net(rt->rt6i_dev),
1362         };
1363         return __ip6_del_rt(rt, &info);
1364 }
1365
1366 static int ip6_route_del(struct fib6_config *cfg)
1367 {
1368         struct fib6_table *table;
1369         struct fib6_node *fn;
1370         struct rt6_info *rt;
1371         int err = -ESRCH;
1372
1373         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1374         if (table == NULL)
1375                 return err;
1376
1377         read_lock_bh(&table->tb6_lock);
1378
1379         fn = fib6_locate(&table->tb6_root,
1380                          &cfg->fc_dst, cfg->fc_dst_len,
1381                          &cfg->fc_src, cfg->fc_src_len);
1382
1383         if (fn) {
1384                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1385                         if (cfg->fc_ifindex &&
1386                             (rt->rt6i_dev == NULL ||
1387                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1388                                 continue;
1389                         if (cfg->fc_flags & RTF_GATEWAY &&
1390                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1391                                 continue;
1392                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1393                                 continue;
1394                         dst_hold(&rt->dst);
1395                         read_unlock_bh(&table->tb6_lock);
1396
1397                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1398                 }
1399         }
1400         read_unlock_bh(&table->tb6_lock);
1401
1402         return err;
1403 }
1404
1405 /*
1406  *      Handle redirects
1407  */
/*
 * Lookup key for redirect handling: a flow plus the gateway address that
 * candidate routes are compared against.  fl must remain the first
 * member: the structure is passed to the lookup as a struct flowi * and
 * cast back to struct ip6rd_flowi * in __ip6_route_redirect().
 */
1408 struct ip6rd_flowi {
1409         struct flowi fl;
1410         struct in6_addr gateway;
1411 };
1412
1413 static struct rt6_info *__ip6_route_redirect(struct net *net,
1414                                              struct fib6_table *table,
1415                                              struct flowi *fl,
1416                                              int flags)
1417 {
1418         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1419         struct rt6_info *rt;
1420         struct fib6_node *fn;
1421
1422         /*
1423          * Get the "current" route for this destination and
1424          * check if the redirect has come from approriate router.
1425          *
1426          * RFC 2461 specifies that redirects should only be
1427          * accepted if they come from the nexthop to the target.
1428          * Due to the way the routes are chosen, this notion
1429          * is a bit fuzzy and one might need to check all possible
1430          * routes.
1431          */
1432
1433         read_lock_bh(&table->tb6_lock);
1434         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1435 restart:
1436         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1437                 /*
1438                  * Current route is on-link; redirect is always invalid.
1439                  *
1440                  * Seems, previous statement is not true. It could
1441                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1442                  * But then router serving it might decide, that we should
1443                  * know truth 8)8) --ANK (980726).
1444                  */
1445                 if (rt6_check_expired(rt))
1446                         continue;
1447                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1448                         continue;
1449                 if (fl->oif != rt->rt6i_dev->ifindex)
1450                         continue;
1451                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1452                         continue;
1453                 break;
1454         }
1455
1456         if (!rt)
1457                 rt = net->ipv6.ip6_null_entry;
1458         BACKTRACK(net, &fl->fl6_src);
1459 out:
1460         dst_hold(&rt->dst);
1461
1462         read_unlock_bh(&table->tb6_lock);
1463
1464         return rt;
1465 };
1466
1467 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1468                                            struct in6_addr *src,
1469                                            struct in6_addr *gateway,
1470                                            struct net_device *dev)
1471 {
1472         int flags = RT6_LOOKUP_F_HAS_SADDR;
1473         struct net *net = dev_net(dev);
1474         struct ip6rd_flowi rdfl = {
1475                 .fl = {
1476                         .oif = dev->ifindex,
1477                         .fl6_dst = *dest,
1478                         .fl6_src = *src,
1479                 },
1480         };
1481
1482         ipv6_addr_copy(&rdfl.gateway, gateway);
1483
1484         if (rt6_need_strict(dest))
1485                 flags |= RT6_LOOKUP_F_IFACE;
1486
1487         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1488                                                    flags, __ip6_route_redirect);
1489 }
1490
1491 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1492                   struct in6_addr *saddr,
1493                   struct neighbour *neigh, u8 *lladdr, int on_link)
1494 {
1495         struct rt6_info *rt, *nrt = NULL;
1496         struct netevent_redirect netevent;
1497         struct net *net = dev_net(neigh->dev);
1498
1499         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1500
1501         if (rt == net->ipv6.ip6_null_entry) {
1502                 if (net_ratelimit())
1503                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1504                                "for redirect target\n");
1505                 goto out;
1506         }
1507
1508         /*
1509          *      We have finally decided to accept it.
1510          */
1511
1512         neigh_update(neigh, lladdr, NUD_STALE,
1513                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1514                      NEIGH_UPDATE_F_OVERRIDE|
1515                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1516                                      NEIGH_UPDATE_F_ISROUTER))
1517                      );
1518
1519         /*
1520          * Redirect received -> path was valid.
1521          * Look, redirects are sent only in response to data packets,
1522          * so that this nexthop apparently is reachable. --ANK
1523          */
1524         dst_confirm(&rt->dst);
1525
1526         /* Duplicate redirect: silently ignore. */
1527         if (neigh == rt->dst.neighbour)
1528                 goto out;
1529
1530         nrt = ip6_rt_copy(rt);
1531         if (nrt == NULL)
1532                 goto out;
1533
1534         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1535         if (on_link)
1536                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1537
1538         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1539         nrt->rt6i_dst.plen = 128;
1540         nrt->dst.flags |= DST_HOST;
1541
1542         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1543         nrt->rt6i_nexthop = neigh_clone(neigh);
1544         /* Reset pmtu, it may be better */
1545         dst_metric_set(&nrt->dst, RTAX_MTU, ipv6_get_mtu(neigh->dev));
1546
1547         if (ip6_ins_rt(nrt))
1548                 goto out;
1549
1550         netevent.old = &rt->dst;
1551         netevent.new = &nrt->dst;
1552         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1553
1554         if (rt->rt6i_flags&RTF_CACHE) {
1555                 ip6_del_rt(rt);
1556                 return;
1557         }
1558
1559 out:
1560         dst_release(&rt->dst);
1561 }
1562
1563 /*
1564  *      Handle ICMP "packet too big" messages
1565  *      i.e. Path MTU discovery
1566  */
1567
1568 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1569                              struct net *net, u32 pmtu, int ifindex)
1570 {
1571         struct rt6_info *rt, *nrt;
1572         int allfrag = 0;
1573
1574         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1575         if (rt == NULL)
1576                 return;
1577
1578         if (pmtu >= dst_mtu(&rt->dst))
1579                 goto out;
1580
1581         if (pmtu < IPV6_MIN_MTU) {
1582                 /*
1583                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1584                  * MTU (1280) and a fragment header should always be included
1585                  * after a node receiving Too Big message reporting PMTU is
1586                  * less than the IPv6 Minimum Link MTU.
1587                  */
1588                 pmtu = IPV6_MIN_MTU;
1589                 allfrag = 1;
1590         }
1591
1592         /* New mtu received -> path was valid.
1593            They are sent only in response to data packets,
1594            so that this nexthop apparently is reachable. --ANK
1595          */
1596         dst_confirm(&rt->dst);
1597
1598         /* Host route. If it is static, it would be better
1599            not to override it, but add new one, so that
1600            when cache entry will expire old pmtu
1601            would return automatically.
1602          */
1603         if (rt->rt6i_flags & RTF_CACHE) {
1604                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1605                 if (allfrag) {
1606                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1607                         features |= RTAX_FEATURE_ALLFRAG;
1608                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1609                 }
1610                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1611                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1612                 goto out;
1613         }
1614
1615         /* Network route.
1616            Two cases are possible:
1617            1. It is connected route. Action: COW
1618            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1619          */
1620         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1621                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1622         else
1623                 nrt = rt6_alloc_clone(rt, daddr);
1624
1625         if (nrt) {
1626                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1627                 if (allfrag) {
1628                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1629                         features |= RTAX_FEATURE_ALLFRAG;
1630                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1631                 }
1632
1633                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1634                  * happened within 5 mins, the recommended timer is 10 mins.
1635                  * Here this route expiration time is set to ip6_rt_mtu_expires
1636                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1637                  * and detecting PMTU increase will be automatically happened.
1638                  */
1639                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1640                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1641
1642                 ip6_ins_rt(nrt);
1643         }
1644 out:
1645         dst_release(&rt->dst);
1646 }
1647
1648 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1649                         struct net_device *dev, u32 pmtu)
1650 {
1651         struct net *net = dev_net(dev);
1652
1653         /*
1654          * RFC 1981 states that a node "MUST reduce the size of the packets it
1655          * is sending along the path" that caused the Packet Too Big message.
1656          * Since it's not possible in the general case to determine which
1657          * interface was used to send the original packet, we update the MTU
1658          * on the interface that will be used to send future packets. We also
1659          * update the MTU on the interface that received the Packet Too Big in
1660          * case the original packet was forced out that interface with
1661          * SO_BINDTODEVICE or similar. This is the next best thing to the
1662          * correct behaviour, which would be to update the MTU on all
1663          * interfaces.
1664          */
1665         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1666         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1667 }
1668
1669 /*
1670  *      Misc support functions
1671  */
1672
1673 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1674 {
1675         struct net *net = dev_net(ort->rt6i_dev);
1676         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1677
1678         if (rt) {
1679                 rt->dst.input = ort->dst.input;
1680                 rt->dst.output = ort->dst.output;
1681
1682                 dst_copy_metrics(&rt->dst, &ort->dst);
1683                 rt->dst.error = ort->dst.error;
1684                 rt->dst.dev = ort->dst.dev;
1685                 if (rt->dst.dev)
1686                         dev_hold(rt->dst.dev);
1687                 rt->rt6i_idev = ort->rt6i_idev;
1688                 if (rt->rt6i_idev)
1689                         in6_dev_hold(rt->rt6i_idev);
1690                 rt->dst.lastuse = jiffies;
1691                 rt->rt6i_expires = 0;
1692
1693                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1694                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1695                 rt->rt6i_metric = 0;
1696
1697                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1698 #ifdef CONFIG_IPV6_SUBTREES
1699                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1700 #endif
1701                 rt->rt6i_table = ort->rt6i_table;
1702         }
1703         return rt;
1704 }
1705
1706 #ifdef CONFIG_IPV6_ROUTE_INFO
1707 static struct rt6_info *rt6_get_route_info(struct net *net,
1708                                            struct in6_addr *prefix, int prefixlen,
1709                                            struct in6_addr *gwaddr, int ifindex)
1710 {
1711         struct fib6_node *fn;
1712         struct rt6_info *rt = NULL;
1713         struct fib6_table *table;
1714
1715         table = fib6_get_table(net, RT6_TABLE_INFO);
1716         if (table == NULL)
1717                 return NULL;
1718
1719         write_lock_bh(&table->tb6_lock);
1720         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1721         if (!fn)
1722                 goto out;
1723
1724         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1725                 if (rt->rt6i_dev->ifindex != ifindex)
1726                         continue;
1727                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1728                         continue;
1729                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1730                         continue;
1731                 dst_hold(&rt->dst);
1732                 break;
1733         }
1734 out:
1735         write_unlock_bh(&table->tb6_lock);
1736         return rt;
1737 }
1738
1739 static struct rt6_info *rt6_add_route_info(struct net *net,
1740                                            struct in6_addr *prefix, int prefixlen,
1741                                            struct in6_addr *gwaddr, int ifindex,
1742                                            unsigned pref)
1743 {
1744         struct fib6_config cfg = {
1745                 .fc_table       = RT6_TABLE_INFO,
1746                 .fc_metric      = IP6_RT_PRIO_USER,
1747                 .fc_ifindex     = ifindex,
1748                 .fc_dst_len     = prefixlen,
1749                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1750                                   RTF_UP | RTF_PREF(pref),
1751                 .fc_nlinfo.pid = 0,
1752                 .fc_nlinfo.nlh = NULL,
1753                 .fc_nlinfo.nl_net = net,
1754         };
1755
1756         ipv6_addr_copy(&cfg.fc_dst, prefix);
1757         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1758
1759         /* We should treat it as a default route if prefix length is 0. */
1760         if (!prefixlen)
1761                 cfg.fc_flags |= RTF_DEFAULT;
1762
1763         ip6_route_add(&cfg);
1764
1765         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1766 }
1767 #endif
1768
1769 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1770 {
1771         struct rt6_info *rt;
1772         struct fib6_table *table;
1773
1774         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1775         if (table == NULL)
1776                 return NULL;
1777
1778         write_lock_bh(&table->tb6_lock);
1779         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1780                 if (dev == rt->rt6i_dev &&
1781                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1782                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1783                         break;
1784         }
1785         if (rt)
1786                 dst_hold(&rt->dst);
1787         write_unlock_bh(&table->tb6_lock);
1788         return rt;
1789 }
1790
1791 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1792                                      struct net_device *dev,
1793                                      unsigned int pref)
1794 {
1795         struct fib6_config cfg = {
1796                 .fc_table       = RT6_TABLE_DFLT,
1797                 .fc_metric      = IP6_RT_PRIO_USER,
1798                 .fc_ifindex     = dev->ifindex,
1799                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1800                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1801                 .fc_nlinfo.pid = 0,
1802                 .fc_nlinfo.nlh = NULL,
1803                 .fc_nlinfo.nl_net = dev_net(dev),
1804         };
1805
1806         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1807
1808         ip6_route_add(&cfg);
1809
1810         return rt6_get_dflt_router(gwaddr, dev);
1811 }
1812
1813 void rt6_purge_dflt_routers(struct net *net)
1814 {
1815         struct rt6_info *rt;
1816         struct fib6_table *table;
1817
1818         /* NOTE: Keep consistent with rt6_get_dflt_router */
1819         table = fib6_get_table(net, RT6_TABLE_DFLT);
1820         if (table == NULL)
1821                 return;
1822
1823 restart:
1824         read_lock_bh(&table->tb6_lock);
1825         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1826                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1827                         dst_hold(&rt->dst);
1828                         read_unlock_bh(&table->tb6_lock);
1829                         ip6_del_rt(rt);
1830                         goto restart;
1831                 }
1832         }
1833         read_unlock_bh(&table->tb6_lock);
1834 }
1835
1836 static void rtmsg_to_fib6_config(struct net *net,
1837                                  struct in6_rtmsg *rtmsg,
1838                                  struct fib6_config *cfg)
1839 {
1840         memset(cfg, 0, sizeof(*cfg));
1841
1842         cfg->fc_table = RT6_TABLE_MAIN;
1843         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1844         cfg->fc_metric = rtmsg->rtmsg_metric;
1845         cfg->fc_expires = rtmsg->rtmsg_info;
1846         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1847         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1848         cfg->fc_flags = rtmsg->rtmsg_flags;
1849
1850         cfg->fc_nlinfo.nl_net = net;
1851
1852         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1853         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1854         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1855 }
1856
1857 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1858 {
1859         struct fib6_config cfg;
1860         struct in6_rtmsg rtmsg;
1861         int err;
1862
1863         switch(cmd) {
1864         case SIOCADDRT:         /* Add a route */
1865         case SIOCDELRT:         /* Delete a route */
1866                 if (!capable(CAP_NET_ADMIN))
1867                         return -EPERM;
1868                 err = copy_from_user(&rtmsg, arg,
1869                                      sizeof(struct in6_rtmsg));
1870                 if (err)
1871                         return -EFAULT;
1872
1873                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1874
1875                 rtnl_lock();
1876                 switch (cmd) {
1877                 case SIOCADDRT:
1878                         err = ip6_route_add(&cfg);
1879                         break;
1880                 case SIOCDELRT:
1881                         err = ip6_route_del(&cfg);
1882                         break;
1883                 default:
1884                         err = -EINVAL;
1885                 }
1886                 rtnl_unlock();
1887
1888                 return err;
1889         }
1890
1891         return -EINVAL;
1892 }
1893
1894 /*
1895  *      Drop the packet on the floor
1896  */
1897
1898 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1899 {
1900         int type;
1901         struct dst_entry *dst = skb_dst(skb);
1902         switch (ipstats_mib_noroutes) {
1903         case IPSTATS_MIB_INNOROUTES:
1904                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1905                 if (type == IPV6_ADDR_ANY) {
1906                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1907                                       IPSTATS_MIB_INADDRERRORS);
1908                         break;
1909                 }
1910                 /* FALLTHROUGH */
1911         case IPSTATS_MIB_OUTNOROUTES:
1912                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1913                               ipstats_mib_noroutes);
1914                 break;
1915         }
1916         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1917         kfree_skb(skb);
1918         return 0;
1919 }
1920
1921 static int ip6_pkt_discard(struct sk_buff *skb)
1922 {
1923         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1924 }
1925
1926 static int ip6_pkt_discard_out(struct sk_buff *skb)
1927 {
1928         skb->dev = skb_dst(skb)->dev;
1929         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1930 }
1931
1932 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1933
1934 static int ip6_pkt_prohibit(struct sk_buff *skb)
1935 {
1936         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1937 }
1938
1939 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1940 {
1941         skb->dev = skb_dst(skb)->dev;
1942         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1943 }
1944
1945 #endif
1946
1947 /*
1948  *      Allocate a dst for local (unicast / anycast) address.
1949  */
1950
1951 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1952                                     const struct in6_addr *addr,
1953                                     int anycast)
1954 {
1955         struct net *net = dev_net(idev->dev);
1956         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1957         struct neighbour *neigh;
1958
1959         if (rt == NULL) {
1960                 if (net_ratelimit())
1961                         pr_warning("IPv6:  Maximum number of routes reached,"
1962                                    " consider increasing route/max_size.\n");
1963                 return ERR_PTR(-ENOMEM);
1964         }
1965
1966         dev_hold(net->loopback_dev);
1967         in6_dev_hold(idev);
1968
1969         rt->dst.flags = DST_HOST;
1970         rt->dst.input = ip6_input;
1971         rt->dst.output = ip6_output;
1972         rt->rt6i_dev = net->loopback_dev;
1973         rt->rt6i_idev = idev;
1974         dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(rt->rt6i_dev));
1975         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1976         rt->dst.obsolete = -1;
1977
1978         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1979         if (anycast)
1980                 rt->rt6i_flags |= RTF_ANYCAST;
1981         else
1982                 rt->rt6i_flags |= RTF_LOCAL;
1983         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1984         if (IS_ERR(neigh)) {
1985                 dst_free(&rt->dst);
1986
1987                 /* We are casting this because that is the return
1988                  * value type.  But an errno encoded pointer is the
1989                  * same regardless of the underlying pointer type,
1990                  * and that's what we are returning.  So this is OK.
1991                  */
1992                 return (struct rt6_info *) neigh;
1993         }
1994         rt->rt6i_nexthop = neigh;
1995
1996         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1997         rt->rt6i_dst.plen = 128;
1998         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1999
2000         atomic_set(&rt->dst.__refcnt, 1);
2001
2002         return rt;
2003 }
2004
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches every route */
	struct net *net;	/* namespace whose ip6_null_entry is spared */
};
2009
2010 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2011 {
2012         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
2013         struct net *net = ((struct arg_dev_net *)arg)->net;
2014
2015         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2016             rt != net->ipv6.ip6_null_entry) {
2017                 RT6_TRACE("deleted by ifdown %p\n", rt);
2018                 return -1;
2019         }
2020         return 0;
2021 }
2022
2023 void rt6_ifdown(struct net *net, struct net_device *dev)
2024 {
2025         struct arg_dev_net adn = {
2026                 .dev = dev,
2027                 .net = net,
2028         };
2029
2030         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2031         icmp6_clean_all(fib6_ifdown, &adn);
2032 }
2033
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
2039
2040 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2041 {
2042         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2043         struct inet6_dev *idev;
2044
2045         /* In IPv6 pmtu discovery is not optional,
2046            so that RTAX_MTU lock cannot disable it.
2047            We still use this lock to block changes
2048            caused by addrconf/ndisc.
2049         */
2050
2051         idev = __in6_dev_get(arg->dev);
2052         if (idev == NULL)
2053                 return 0;
2054
2055         /* For administrative MTU increase, there is no way to discover
2056            IPv6 PMTU increase, so PMTU increase should be updated here.
2057            Since RFC 1981 doesn't include administrative MTU increase
2058            update PMTU increase is a MUST. (i.e. jumbo frame)
2059          */
2060         /*
2061            If new MTU is less than route PMTU, this new MTU will be the
2062            lowest MTU in the path, update the route PMTU to reflect PMTU
2063            decreases; if new MTU is greater than route PMTU, and the
2064            old MTU is the lowest MTU in the path, update the route PMTU
2065            to reflect the increase. In this case if the other nodes' MTU
2066            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2067            PMTU discouvery.
2068          */
2069         if (rt->rt6i_dev == arg->dev &&
2070             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2071             (dst_mtu(&rt->dst) >= arg->mtu ||
2072              (dst_mtu(&rt->dst) < arg->mtu &&
2073               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2074                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2075         }
2076         return 0;
2077 }
2078
2079 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2080 {
2081         struct rt6_mtu_change_arg arg = {
2082                 .dev = dev,
2083                 .mtu = mtu,
2084         };
2085
2086         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2087 }
2088
2089 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2090         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2091         [RTA_OIF]               = { .type = NLA_U32 },
2092         [RTA_IIF]               = { .type = NLA_U32 },
2093         [RTA_PRIORITY]          = { .type = NLA_U32 },
2094         [RTA_METRICS]           = { .type = NLA_NESTED },
2095 };
2096
2097 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2098                               struct fib6_config *cfg)
2099 {
2100         struct rtmsg *rtm;
2101         struct nlattr *tb[RTA_MAX+1];
2102         int err;
2103
2104         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2105         if (err < 0)
2106                 goto errout;
2107
2108         err = -EINVAL;
2109         rtm = nlmsg_data(nlh);
2110         memset(cfg, 0, sizeof(*cfg));
2111
2112         cfg->fc_table = rtm->rtm_table;
2113         cfg->fc_dst_len = rtm->rtm_dst_len;
2114         cfg->fc_src_len = rtm->rtm_src_len;
2115         cfg->fc_flags = RTF_UP;
2116         cfg->fc_protocol = rtm->rtm_protocol;
2117
2118         if (rtm->rtm_type == RTN_UNREACHABLE)
2119                 cfg->fc_flags |= RTF_REJECT;
2120
2121         if (rtm->rtm_type == RTN_LOCAL)
2122                 cfg->fc_flags |= RTF_LOCAL;
2123
2124         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2125         cfg->fc_nlinfo.nlh = nlh;
2126         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2127
2128         if (tb[RTA_GATEWAY]) {
2129                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2130                 cfg->fc_flags |= RTF_GATEWAY;
2131         }
2132
2133         if (tb[RTA_DST]) {
2134                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2135
2136                 if (nla_len(tb[RTA_DST]) < plen)
2137                         goto errout;
2138
2139                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2140         }
2141
2142         if (tb[RTA_SRC]) {
2143                 int plen = (rtm->rtm_src_len + 7) >> 3;
2144
2145                 if (nla_len(tb[RTA_SRC]) < plen)
2146                         goto errout;
2147
2148                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2149         }
2150
2151         if (tb[RTA_OIF])
2152                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2153
2154         if (tb[RTA_PRIORITY])
2155                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2156
2157         if (tb[RTA_METRICS]) {
2158                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2159                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2160         }
2161
2162         if (tb[RTA_TABLE])
2163                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2164
2165         err = 0;
2166 errout:
2167         return err;
2168 }
2169
2170 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2171 {
2172         struct fib6_config cfg;
2173         int err;
2174
2175         err = rtm_to_fib6_config(skb, nlh, &cfg);
2176         if (err < 0)
2177                 return err;
2178
2179         return ip6_route_del(&cfg);
2180 }
2181
2182 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2183 {
2184         struct fib6_config cfg;
2185         int err;
2186
2187         err = rtm_to_fib6_config(skb, nlh, &cfg);
2188         if (err < 0)
2189                 return err;
2190
2191         return ip6_route_add(&cfg);
2192 }
2193
2194 static inline size_t rt6_nlmsg_size(void)
2195 {
2196         return NLMSG_ALIGN(sizeof(struct rtmsg))
2197                + nla_total_size(16) /* RTA_SRC */
2198                + nla_total_size(16) /* RTA_DST */
2199                + nla_total_size(16) /* RTA_GATEWAY */
2200                + nla_total_size(16) /* RTA_PREFSRC */
2201                + nla_total_size(4) /* RTA_TABLE */
2202                + nla_total_size(4) /* RTA_IIF */
2203                + nla_total_size(4) /* RTA_OIF */
2204                + nla_total_size(4) /* RTA_PRIORITY */
2205                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2206                + nla_total_size(sizeof(struct rta_cacheinfo));
2207 }
2208
2209 static int rt6_fill_node(struct net *net,
2210                          struct sk_buff *skb, struct rt6_info *rt,
2211                          struct in6_addr *dst, struct in6_addr *src,
2212                          int iif, int type, u32 pid, u32 seq,
2213                          int prefix, int nowait, unsigned int flags)
2214 {
2215         struct rtmsg *rtm;
2216         struct nlmsghdr *nlh;
2217         long expires;
2218         u32 table;
2219
2220         if (prefix) {   /* user wants prefix routes only */
2221                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2222                         /* success since this is not a prefix route */
2223                         return 1;
2224                 }
2225         }
2226
2227         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2228         if (nlh == NULL)
2229                 return -EMSGSIZE;
2230
2231         rtm = nlmsg_data(nlh);
2232         rtm->rtm_family = AF_INET6;
2233         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2234         rtm->rtm_src_len = rt->rt6i_src.plen;
2235         rtm->rtm_tos = 0;
2236         if (rt->rt6i_table)
2237                 table = rt->rt6i_table->tb6_id;
2238         else
2239                 table = RT6_TABLE_UNSPEC;
2240         rtm->rtm_table = table;
2241         NLA_PUT_U32(skb, RTA_TABLE, table);
2242         if (rt->rt6i_flags&RTF_REJECT)
2243                 rtm->rtm_type = RTN_UNREACHABLE;
2244         else if (rt->rt6i_flags&RTF_LOCAL)
2245                 rtm->rtm_type = RTN_LOCAL;
2246         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2247                 rtm->rtm_type = RTN_LOCAL;
2248         else
2249                 rtm->rtm_type = RTN_UNICAST;
2250         rtm->rtm_flags = 0;
2251         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2252         rtm->rtm_protocol = rt->rt6i_protocol;
2253         if (rt->rt6i_flags&RTF_DYNAMIC)
2254                 rtm->rtm_protocol = RTPROT_REDIRECT;
2255         else if (rt->rt6i_flags & RTF_ADDRCONF)
2256                 rtm->rtm_protocol = RTPROT_KERNEL;
2257         else if (rt->rt6i_flags&RTF_DEFAULT)
2258                 rtm->rtm_protocol = RTPROT_RA;
2259
2260         if (rt->rt6i_flags&RTF_CACHE)
2261                 rtm->rtm_flags |= RTM_F_CLONED;
2262
2263         if (dst) {
2264                 NLA_PUT(skb, RTA_DST, 16, dst);
2265                 rtm->rtm_dst_len = 128;
2266         } else if (rtm->rtm_dst_len)
2267                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2268 #ifdef CONFIG_IPV6_SUBTREES
2269         if (src) {
2270                 NLA_PUT(skb, RTA_SRC, 16, src);
2271                 rtm->rtm_src_len = 128;
2272         } else if (rtm->rtm_src_len)
2273                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2274 #endif
2275         if (iif) {
2276 #ifdef CONFIG_IPV6_MROUTE
2277                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2278                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2279                         if (err <= 0) {
2280                                 if (!nowait) {
2281                                         if (err == 0)
2282                                                 return 0;
2283                                         goto nla_put_failure;
2284                                 } else {
2285                                         if (err == -EMSGSIZE)
2286                                                 goto nla_put_failure;
2287                                 }
2288                         }
2289                 } else
2290 #endif
2291                         NLA_PUT_U32(skb, RTA_IIF, iif);
2292         } else if (dst) {
2293                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2294                 struct in6_addr saddr_buf;
2295                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2296                                        dst, 0, &saddr_buf) == 0)
2297                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2298         }
2299
2300         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2301                 goto nla_put_failure;
2302
2303         if (rt->dst.neighbour)
2304                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2305
2306         if (rt->dst.dev)
2307                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2308
2309         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2310
2311         if (!(rt->rt6i_flags & RTF_EXPIRES))
2312                 expires = 0;
2313         else if (rt->rt6i_expires - jiffies < INT_MAX)
2314                 expires = rt->rt6i_expires - jiffies;
2315         else
2316                 expires = INT_MAX;
2317
2318         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2319                                expires, rt->dst.error) < 0)
2320                 goto nla_put_failure;
2321
2322         return nlmsg_end(skb, nlh);
2323
2324 nla_put_failure:
2325         nlmsg_cancel(skb, nlh);
2326         return -EMSGSIZE;
2327 }
2328
2329 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2330 {
2331         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2332         int prefix;
2333
2334         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2335                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2336                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2337         } else
2338                 prefix = 0;
2339
2340         return rt6_fill_node(arg->net,
2341                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2342                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2343                      prefix, 0, NLM_F_MULTI);
2344 }
2345
2346 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2347 {
2348         struct net *net = sock_net(in_skb->sk);
2349         struct nlattr *tb[RTA_MAX+1];
2350         struct rt6_info *rt;
2351         struct sk_buff *skb;
2352         struct rtmsg *rtm;
2353         struct flowi fl;
2354         int err, iif = 0;
2355
2356         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2357         if (err < 0)
2358                 goto errout;
2359
2360         err = -EINVAL;
2361         memset(&fl, 0, sizeof(fl));
2362
2363         if (tb[RTA_SRC]) {
2364                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2365                         goto errout;
2366
2367                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2368         }
2369
2370         if (tb[RTA_DST]) {
2371                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2372                         goto errout;
2373
2374                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2375         }
2376
2377         if (tb[RTA_IIF])
2378                 iif = nla_get_u32(tb[RTA_IIF]);
2379
2380         if (tb[RTA_OIF])
2381                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2382
2383         if (iif) {
2384                 struct net_device *dev;
2385                 dev = __dev_get_by_index(net, iif);
2386                 if (!dev) {
2387                         err = -ENODEV;
2388                         goto errout;
2389                 }
2390         }
2391
2392         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2393         if (skb == NULL) {
2394                 err = -ENOBUFS;
2395                 goto errout;
2396         }
2397
2398         /* Reserve room for dummy headers, this skb can pass
2399            through good chunk of routing engine.
2400          */
2401         skb_reset_mac_header(skb);
2402         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2403
2404         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2405         skb_dst_set(skb, &rt->dst);
2406
2407         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2408                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2409                             nlh->nlmsg_seq, 0, 0, 0);
2410         if (err < 0) {
2411                 kfree_skb(skb);
2412                 goto errout;
2413         }
2414
2415         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2416 errout:
2417         return err;
2418 }
2419
2420 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2421 {
2422         struct sk_buff *skb;
2423         struct net *net = info->nl_net;
2424         u32 seq;
2425         int err;
2426
2427         err = -ENOBUFS;
2428         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2429
2430         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2431         if (skb == NULL)
2432                 goto errout;
2433
2434         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2435                                 event, info->pid, seq, 0, 0, 0);
2436         if (err < 0) {
2437                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2438                 WARN_ON(err == -EMSGSIZE);
2439                 kfree_skb(skb);
2440                 goto errout;
2441         }
2442         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2443                     info->nlh, gfp_any());
2444         return;
2445 errout:
2446         if (err < 0)
2447                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2448 }
2449
2450 static int ip6_route_dev_notify(struct notifier_block *this,
2451                                 unsigned long event, void *data)
2452 {
2453         struct net_device *dev = (struct net_device *)data;
2454         struct net *net = dev_net(dev);
2455
2456         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2457                 net->ipv6.ip6_null_entry->dst.dev = dev;
2458                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2459 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2460                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2461                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2462                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2463                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2464 #endif
2465         }
2466
2467         return NOTIFY_OK;
2468 }
2469
2470 /*
2471  *      /proc
2472  */
2473
2474 #ifdef CONFIG_PROC_FS
2475
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in the nearby seq_file based code references this
 * structure; confirm whether it still has users.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2484
2485 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2486 {
2487         struct seq_file *m = p_arg;
2488
2489         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2490
2491 #ifdef CONFIG_IPV6_SUBTREES
2492         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2493 #else
2494         seq_puts(m, "00000000000000000000000000000000 00 ");
2495 #endif
2496
2497         if (rt->rt6i_nexthop) {
2498                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2499         } else {
2500                 seq_puts(m, "00000000000000000000000000000000");
2501         }
2502         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2503                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2504                    rt->dst.__use, rt->rt6i_flags,
2505                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2506         return 0;
2507 }
2508
2509 static int ipv6_route_show(struct seq_file *m, void *v)
2510 {
2511         struct net *net = (struct net *)m->private;
2512         fib6_clean_all(net, rt6_info_route, 0, m);
2513         return 0;
2514 }
2515
2516 static int ipv6_route_open(struct inode *inode, struct file *file)
2517 {
2518         return single_open_net(inode, file, ipv6_route_show);
2519 }
2520
2521 static const struct file_operations ipv6_route_proc_fops = {
2522         .owner          = THIS_MODULE,
2523         .open           = ipv6_route_open,
2524         .read           = seq_read,
2525         .llseek         = seq_lseek,
2526         .release        = single_release_net,
2527 };
2528
2529 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2530 {
2531         struct net *net = (struct net *)seq->private;
2532         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2533                    net->ipv6.rt6_stats->fib_nodes,
2534                    net->ipv6.rt6_stats->fib_route_nodes,
2535                    net->ipv6.rt6_stats->fib_rt_alloc,
2536                    net->ipv6.rt6_stats->fib_rt_entries,
2537                    net->ipv6.rt6_stats->fib_rt_cache,
2538                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2539                    net->ipv6.rt6_stats->fib_discarded_routes);
2540
2541         return 0;
2542 }
2543
2544 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2545 {
2546         return single_open_net(inode, file, rt6_stats_seq_show);
2547 }
2548
2549 static const struct file_operations rt6_stats_seq_fops = {
2550         .owner   = THIS_MODULE,
2551         .open    = rt6_stats_seq_open,
2552         .read    = seq_read,
2553         .llseek  = seq_lseek,
2554         .release = single_release_net,
2555 };
2556 #endif  /* CONFIG_PROC_FS */
2557
2558 #ifdef CONFIG_SYSCTL
2559
2560 static
2561 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2562                               void __user *buffer, size_t *lenp, loff_t *ppos)
2563 {
2564         struct net *net = current->nsproxy->net_ns;
2565         int delay = net->ipv6.sysctl.flush_delay;
2566         if (write) {
2567                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2568                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2569                 return 0;
2570         } else
2571                 return -EINVAL;
2572 }
2573
2574 ctl_table ipv6_route_table_template[] = {
2575         {
2576                 .procname       =       "flush",
2577                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2578                 .maxlen         =       sizeof(int),
2579                 .mode           =       0200,
2580                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2581         },
2582         {
2583                 .procname       =       "gc_thresh",
2584                 .data           =       &ip6_dst_ops_template.gc_thresh,
2585                 .maxlen         =       sizeof(int),
2586                 .mode           =       0644,
2587                 .proc_handler   =       proc_dointvec,
2588         },
2589         {
2590                 .procname       =       "max_size",
2591                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2592                 .maxlen         =       sizeof(int),
2593                 .mode           =       0644,
2594                 .proc_handler   =       proc_dointvec,
2595         },
2596         {
2597                 .procname       =       "gc_min_interval",
2598                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2599                 .maxlen         =       sizeof(int),
2600                 .mode           =       0644,
2601                 .proc_handler   =       proc_dointvec_jiffies,
2602         },
2603         {
2604                 .procname       =       "gc_timeout",
2605                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2606                 .maxlen         =       sizeof(int),
2607                 .mode           =       0644,
2608                 .proc_handler   =       proc_dointvec_jiffies,
2609         },
2610         {
2611                 .procname       =       "gc_interval",
2612                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2613                 .maxlen         =       sizeof(int),
2614                 .mode           =       0644,
2615                 .proc_handler   =       proc_dointvec_jiffies,
2616         },
2617         {
2618                 .procname       =       "gc_elasticity",
2619                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2620                 .maxlen         =       sizeof(int),
2621                 .mode           =       0644,
2622                 .proc_handler   =       proc_dointvec,
2623         },
2624         {
2625                 .procname       =       "mtu_expires",
2626                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2627                 .maxlen         =       sizeof(int),
2628                 .mode           =       0644,
2629                 .proc_handler   =       proc_dointvec_jiffies,
2630         },
2631         {
2632                 .procname       =       "min_adv_mss",
2633                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2634                 .maxlen         =       sizeof(int),
2635                 .mode           =       0644,
2636                 .proc_handler   =       proc_dointvec,
2637         },
2638         {
2639                 .procname       =       "gc_min_interval_ms",
2640                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2641                 .maxlen         =       sizeof(int),
2642                 .mode           =       0644,
2643                 .proc_handler   =       proc_dointvec_ms_jiffies,
2644         },
2645         { }
2646 };
2647
2648 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2649 {
2650         struct ctl_table *table;
2651
2652         table = kmemdup(ipv6_route_table_template,
2653                         sizeof(ipv6_route_table_template),
2654                         GFP_KERNEL);
2655
2656         if (table) {
2657                 table[0].data = &net->ipv6.sysctl.flush_delay;
2658                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2659                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2660                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2661                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2662                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2663                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2664                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2665                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2666                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2667         }
2668
2669         return table;
2670 }
2671 #endif
2672
2673 static int __net_init ip6_route_net_init(struct net *net)
2674 {
2675         int ret = -ENOMEM;
2676
2677         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2678                sizeof(net->ipv6.ip6_dst_ops));
2679
2680         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2681                 goto out_ip6_dst_ops;
2682
2683         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2684                                            sizeof(*net->ipv6.ip6_null_entry),
2685                                            GFP_KERNEL);
2686         if (!net->ipv6.ip6_null_entry)
2687                 goto out_ip6_dst_entries;
2688         net->ipv6.ip6_null_entry->dst.path =
2689                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2690         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2691         dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2692
2693 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2694         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2695                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2696                                                GFP_KERNEL);
2697         if (!net->ipv6.ip6_prohibit_entry)
2698                 goto out_ip6_null_entry;
2699         net->ipv6.ip6_prohibit_entry->dst.path =
2700                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2701         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2702         dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2703
2704         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2705                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2706                                                GFP_KERNEL);
2707         if (!net->ipv6.ip6_blk_hole_entry)
2708                 goto out_ip6_prohibit_entry;
2709         net->ipv6.ip6_blk_hole_entry->dst.path =
2710                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2711         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2712         dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2713 #endif
2714
2715         net->ipv6.sysctl.flush_delay = 0;
2716         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2717         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2718         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2719         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2720         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2721         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2722         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2723
2724 #ifdef CONFIG_PROC_FS
2725         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2726         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2727 #endif
2728         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2729
2730         ret = 0;
2731 out:
2732         return ret;
2733
2734 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2735 out_ip6_prohibit_entry:
2736         kfree(net->ipv6.ip6_prohibit_entry);
2737 out_ip6_null_entry:
2738         kfree(net->ipv6.ip6_null_entry);
2739 #endif
2740 out_ip6_dst_entries:
2741         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2742 out_ip6_dst_ops:
2743         goto out;
2744 }
2745
2746 static void __net_exit ip6_route_net_exit(struct net *net)
2747 {
2748 #ifdef CONFIG_PROC_FS
2749         proc_net_remove(net, "ipv6_route");
2750         proc_net_remove(net, "rt6_stats");
2751 #endif
2752         kfree(net->ipv6.ip6_null_entry);
2753 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2754         kfree(net->ipv6.ip6_prohibit_entry);
2755         kfree(net->ipv6.ip6_blk_hole_entry);
2756 #endif
2757         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2758 }
2759
2760 static struct pernet_operations ip6_route_net_ops = {
2761         .init = ip6_route_net_init,
2762         .exit = ip6_route_net_exit,
2763 };
2764
2765 static struct notifier_block ip6_route_dev_notifier = {
2766         .notifier_call = ip6_route_dev_notify,
2767         .priority = 0,
2768 };
2769
2770 int __init ip6_route_init(void)
2771 {
2772         int ret;
2773
2774         ret = -ENOMEM;
2775         ip6_dst_ops_template.kmem_cachep =
2776                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2777                                   SLAB_HWCACHE_ALIGN, NULL);
2778         if (!ip6_dst_ops_template.kmem_cachep)
2779                 goto out;
2780
2781         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2782         if (ret)
2783                 goto out_kmem_cache;
2784
2785         ret = register_pernet_subsys(&ip6_route_net_ops);
2786         if (ret)
2787                 goto out_dst_entries;
2788
2789         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2790
2791         /* Registering of the loopback is done before this portion of code,
2792          * the loopback reference in rt6_info will not be taken, do it
2793          * manually for init_net */
2794         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2795         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2796   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2797         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2798         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2799         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2800         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2801   #endif
2802         ret = fib6_init();
2803         if (ret)
2804                 goto out_register_subsys;
2805
2806         ret = xfrm6_init();
2807         if (ret)
2808                 goto out_fib6_init;
2809
2810         ret = fib6_rules_init();
2811         if (ret)
2812                 goto xfrm6_init;
2813
2814         ret = -ENOBUFS;
2815         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2816             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2817             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2818                 goto fib6_rules_init;
2819
2820         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2821         if (ret)
2822                 goto fib6_rules_init;
2823
2824 out:
2825         return ret;
2826
2827 fib6_rules_init:
2828         fib6_rules_cleanup();
2829 xfrm6_init:
2830         xfrm6_fini();
2831 out_fib6_init:
2832         fib6_gc_cleanup();
2833 out_register_subsys:
2834         unregister_pernet_subsys(&ip6_route_net_ops);
2835 out_dst_entries:
2836         dst_entries_destroy(&ip6_dst_blackhole_ops);
2837 out_kmem_cache:
2838         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2839         goto out;
2840 }
2841
2842 void ip6_route_cleanup(void)
2843 {
2844         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2845         fib6_rules_cleanup();
2846         xfrm6_fini();
2847         fib6_gc_cleanup();
2848         unregister_pernet_subsys(&ip6_route_net_ops);
2849         dst_entries_destroy(&ip6_dst_blackhole_ops);
2850         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2851 }