]> git.karo-electronics.de Git - mv-sheeva.git/blob - net/ipv6/route.c
Merge git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi-rc-fixes-2.6
[mv-sheeva.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Compile-time debug level for this file.  With RT6_DEBUG at 3 or above,
 * RDBG() and RT6_TRACE() expand to printk calls; below that they expand
 * to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops_template = {
101         .family                 =       AF_INET6,
102         .protocol               =       cpu_to_be16(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .default_advmss         =       ip6_default_advmss,
107         .default_mtu            =       ip6_default_mtu,
108         .destroy                =       ip6_dst_destroy,
109         .ifdown                 =       ip6_dst_ifdown,
110         .negative_advice        =       ip6_negative_advice,
111         .link_failure           =       ip6_link_failure,
112         .update_pmtu            =       ip6_rt_update_pmtu,
113         .local_out              =       __ip6_local_out,
114 };
115
116 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
117 {
118 }
119
120 static struct dst_ops ip6_dst_blackhole_ops = {
121         .family                 =       AF_INET6,
122         .protocol               =       cpu_to_be16(ETH_P_IPV6),
123         .destroy                =       ip6_dst_destroy,
124         .check                  =       ip6_dst_check,
125         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
126 };
127
128 static struct rt6_info ip6_null_entry_template = {
129         .dst = {
130                 .__refcnt       = ATOMIC_INIT(1),
131                 .__use          = 1,
132                 .obsolete       = -1,
133                 .error          = -ENETUNREACH,
134                 .input          = ip6_pkt_discard,
135                 .output         = ip6_pkt_discard_out,
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
138         .rt6i_protocol  = RTPROT_KERNEL,
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route: reject entry with error -EACCES,
 * handled by ip6_pkt_prohibit()/ip6_pkt_prohibit_out().
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use	  = 1,
		.obsolete = -1,
		.error	  = -EACCES,
		.input	  = ip6_pkt_prohibit,
		.output	  = ip6_pkt_prohibit_out,
	},
	.rt6i_flags    = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric   = ~(u32) 0,
	.rt6i_ref      = ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" route: reject entry with error -EINVAL
 * whose input and output handlers are both dst_discard().
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use	  = 1,
		.obsolete = -1,
		.error	  = -EINVAL,
		.input	  = dst_discard,
		.output	  = dst_discard,
	},
	.rt6i_flags    = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric   = ~(u32) 0,
	.rt6i_ref      = ATOMIC_INIT(1),
};

#endif
179
/*
 * Allocate a dst entry from @ops via dst_alloc() and hand it back typed
 * as a struct rt6_info.  The result of dst_alloc() is returned unchanged
 * apart from the cast.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *)dst_alloc(ops);
	return rt;
}
185
186 static void ip6_dst_destroy(struct dst_entry *dst)
187 {
188         struct rt6_info *rt = (struct rt6_info *)dst;
189         struct inet6_dev *idev = rt->rt6i_idev;
190         struct inet_peer *peer = rt->rt6i_peer;
191
192         if (idev != NULL) {
193                 rt->rt6i_idev = NULL;
194                 in6_dev_put(idev);
195         }
196         if (peer) {
197                 BUG_ON(!(rt->rt6i_flags & RTF_CACHE));
198                 rt->rt6i_peer = NULL;
199                 inet_putpeer(peer);
200         }
201 }
202
203 void rt6_bind_peer(struct rt6_info *rt, int create)
204 {
205         struct inet_peer *peer;
206
207         if (WARN_ON(!(rt->rt6i_flags & RTF_CACHE)))
208                 return;
209
210         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
211         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
212                 inet_putpeer(peer);
213 }
214
215 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
216                            int how)
217 {
218         struct rt6_info *rt = (struct rt6_info *)dst;
219         struct inet6_dev *idev = rt->rt6i_idev;
220         struct net_device *loopback_dev =
221                 dev_net(dev)->loopback_dev;
222
223         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
224                 struct inet6_dev *loopback_idev =
225                         in6_dev_get(loopback_dev);
226                 if (loopback_idev != NULL) {
227                         rt->rt6i_idev = loopback_idev;
228                         in6_dev_put(idev);
229                 }
230         }
231 }
232
233 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
234 {
235         return (rt->rt6i_flags & RTF_EXPIRES) &&
236                 time_after(jiffies, rt->rt6i_expires);
237 }
238
239 static inline int rt6_need_strict(struct in6_addr *daddr)
240 {
241         return ipv6_addr_type(daddr) &
242                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
243 }
244
245 /*
246  *      Route lookup. Any table->tb6_lock is implied.
247  */
248
249 static inline struct rt6_info *rt6_device_match(struct net *net,
250                                                     struct rt6_info *rt,
251                                                     struct in6_addr *saddr,
252                                                     int oif,
253                                                     int flags)
254 {
255         struct rt6_info *local = NULL;
256         struct rt6_info *sprt;
257
258         if (!oif && ipv6_addr_any(saddr))
259                 goto out;
260
261         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
262                 struct net_device *dev = sprt->rt6i_dev;
263
264                 if (oif) {
265                         if (dev->ifindex == oif)
266                                 return sprt;
267                         if (dev->flags & IFF_LOOPBACK) {
268                                 if (sprt->rt6i_idev == NULL ||
269                                     sprt->rt6i_idev->dev->ifindex != oif) {
270                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
271                                                 continue;
272                                         if (local && (!oif ||
273                                                       local->rt6i_idev->dev->ifindex == oif))
274                                                 continue;
275                                 }
276                                 local = sprt;
277                         }
278                 } else {
279                         if (ipv6_chk_addr(net, saddr, dev,
280                                           flags & RT6_LOOKUP_F_IFACE))
281                                 return sprt;
282                 }
283         }
284
285         if (oif) {
286                 if (local)
287                         return local;
288
289                 if (flags & RT6_LOOKUP_F_IFACE)
290                         return net->ipv6.ip6_null_entry;
291         }
292 out:
293         return rt;
294 }
295
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If @rt has a next-hop neighbour that is not in a NUD_VALID state and
 * more than the device's rtr_probe_interval has passed since the
 * neighbour was last updated, send a Neighbour Solicitation for it to
 * its solicited-node multicast address.  neigh->updated is refreshed
 * first, which is what rate-limits the probes (Router Reachability
 * Probe MUST be rate-limited to no more than one per minute).
 *
 * A NULL @rt or a route without next hop is silently ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/* Cheap unlocked test first. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		read_unlock_bh(&neigh->lock);
		return;
	}

	neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
330
331 /*
332  * Default Router Selection (RFC 2461 6.3.6)
333  */
334 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
335 {
336         struct net_device *dev = rt->rt6i_dev;
337         if (!oif || dev->ifindex == oif)
338                 return 2;
339         if ((dev->flags & IFF_LOOPBACK) &&
340             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
341                 return 1;
342         return 0;
343 }
344
345 static inline int rt6_check_neigh(struct rt6_info *rt)
346 {
347         struct neighbour *neigh = rt->rt6i_nexthop;
348         int m;
349         if (rt->rt6i_flags & RTF_NONEXTHOP ||
350             !(rt->rt6i_flags & RTF_GATEWAY))
351                 m = 1;
352         else if (neigh) {
353                 read_lock_bh(&neigh->lock);
354                 if (neigh->nud_state & NUD_VALID)
355                         m = 2;
356 #ifdef CONFIG_IPV6_ROUTER_PREF
357                 else if (neigh->nud_state & NUD_FAILED)
358                         m = 0;
359 #endif
360                 else
361                         m = 1;
362                 read_unlock_bh(&neigh->lock);
363         } else
364                 m = 0;
365         return m;
366 }
367
368 static int rt6_score_route(struct rt6_info *rt, int oif,
369                            int strict)
370 {
371         int m, n;
372
373         m = rt6_check_dev(rt, oif);
374         if (!m && (strict & RT6_LOOKUP_F_IFACE))
375                 return -1;
376 #ifdef CONFIG_IPV6_ROUTER_PREF
377         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
378 #endif
379         n = rt6_check_neigh(rt);
380         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
381                 return -1;
382         return m;
383 }
384
385 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
386                                    int *mpri, struct rt6_info *match)
387 {
388         int m;
389
390         if (rt6_check_expired(rt))
391                 goto out;
392
393         m = rt6_score_route(rt, oif, strict);
394         if (m < 0)
395                 goto out;
396
397         if (m > *mpri) {
398                 if (strict & RT6_LOOKUP_F_REACHABLE)
399                         rt6_probe(match);
400                 *mpri = m;
401                 match = rt;
402         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
403                 rt6_probe(rt);
404         }
405
406 out:
407         return match;
408 }
409
410 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
411                                      struct rt6_info *rr_head,
412                                      u32 metric, int oif, int strict)
413 {
414         struct rt6_info *rt, *match;
415         int mpri = -1;
416
417         match = NULL;
418         for (rt = rr_head; rt && rt->rt6i_metric == metric;
419              rt = rt->dst.rt6_next)
420                 match = find_match(rt, oif, strict, &mpri, match);
421         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
422              rt = rt->dst.rt6_next)
423                 match = find_match(rt, oif, strict, &mpri, match);
424
425         return match;
426 }
427
428 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
429 {
430         struct rt6_info *match, *rt0;
431         struct net *net;
432
433         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
434                   __func__, fn->leaf, oif);
435
436         rt0 = fn->rr_ptr;
437         if (!rt0)
438                 fn->rr_ptr = rt0 = fn->leaf;
439
440         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
441
442         if (!match &&
443             (strict & RT6_LOOKUP_F_REACHABLE)) {
444                 struct rt6_info *next = rt0->dst.rt6_next;
445
446                 /* no entries matched; do round-robin */
447                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
448                         next = fn->leaf;
449
450                 if (next != rt0)
451                         fn->rr_ptr = next;
452         }
453
454         RT6_TRACE("%s() => %p\n",
455                   __func__, match);
456
457         net = dev_net(rt0->rt6i_dev);
458         return match ? match : net->ipv6.ip6_null_entry;
459 }
460
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from @gwaddr.
 *
 * @opt points at the option, @len is its length in bytes.  The option is
 * rejected with -EINVAL if it is shorter than struct route_info, if its
 * length field exceeds 3, if its prefix length exceeds 128 or does not
 * fit in the space implied by the length field, or if the preference is
 * ICMPV6_ROUTER_PREF_INVALID.
 *
 * A zero lifetime deletes an existing matching route.  A non-zero
 * lifetime adds the route if it is missing or refreshes the preference
 * of an existing one; the expiry is then set, or cleared when the
 * lifetime is not finite.  Returns 0 on every path past validation.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* a full 128-bit prefix is present in the option */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	if (rt) {
		if (lifetime) {
			rt->rt6i_flags = RTF_ROUTEINFO |
					 (rt->rt6i_flags & ~RTF_PREF_MASK) |
					 RTF_PREF(pref);
		} else {
			ip6_del_rt(rt);
			rt = NULL;
		}
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex, pref);
	}

	if (rt) {
		if (addrconf_finite_timeout(lifetime)) {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		} else {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		}
		dst_release(&rt->dst);
	}
	return 0;
}
#endif
534
/*
 * BACKTRACK(__net, saddr) - climb the FIB tree after a failed selection.
 *
 * Not self-contained: the expansion site must provide the local variables
 * `rt` (struct rt6_info *) and `fn` (struct fib6_node *) and the labels
 * `out` and `restart`.
 *
 * Does nothing unless `rt` is __net's null entry.  Otherwise it loops:
 *   - if `fn` is flagged RTN_TL_ROOT, give up: goto out;
 *   - step to the parent; if the parent has a subtree and that subtree is
 *     not `fn` itself, continue in the node fib6_lookup() finds for
 *     `saddr` inside that subtree, else continue in the parent;
 *   - once the new `fn` is flagged RTN_RTINFO, retry selection there:
 *     goto restart.
 */
535 #define BACKTRACK(__net, saddr)                 \
536 do { \
537         if (rt == __net->ipv6.ip6_null_entry) { \
538                 struct fib6_node *pn; \
539                 while (1) { \
540                         if (fn->fn_flags & RTN_TL_ROOT) \
541                                 goto out; \
542                         pn = fn->parent; \
543                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
544                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
545                         else \
546                                 fn = pn; \
547                         if (fn->fn_flags & RTN_RTINFO) \
548                                 goto restart; \
549                 } \
550         } \
551 } while(0)
552
/*
 * Per-table lookup callback passed to fib6_rule_lookup() by rt6_lookup().
 *
 * Holds table->tb6_lock for reading while it finds the node for the
 * flow's destination/source, picks an entry from the node's leaf chain
 * with rt6_device_match() (using fl->oif and @flags), and backtracks
 * towards the root while the result is the null entry.
 *
 * Returns the chosen route after calling dst_use() on it; this may be
 * @net's null entry.  NOTE(review): dst_use() presumably takes a
 * reference and stamps last use, since rt6_lookup() later calls
 * dst_release() on the result - confirm.
 */
553 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
554                                              struct fib6_table *table,
555                                              struct flowi *fl, int flags)
556 {
557         struct fib6_node *fn;
558         struct rt6_info *rt;
559
560         read_lock_bh(&table->tb6_lock);
561         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
        /* BACKTRACK() jumps back here after moving fn up the tree */
562 restart:
563         rt = fn->leaf;
564         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
565         BACKTRACK(net, &fl->fl6_src);
        /* also reached via BACKTRACK() when the top-level root is hit */
566 out:
567         dst_use(&rt->dst, jiffies);
568         read_unlock_bh(&table->tb6_lock);
569         return rt;
570
571 }
572
573 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
574                             const struct in6_addr *saddr, int oif, int strict)
575 {
576         struct flowi fl = {
577                 .oif = oif,
578                 .fl6_dst = *daddr,
579         };
580         struct dst_entry *dst;
581         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
582
583         if (saddr) {
584                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
585                 flags |= RT6_LOOKUP_F_HAS_SADDR;
586         }
587
588         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
589         if (dst->error == 0)
590                 return (struct rt6_info *) dst;
591
592         dst_release(dst);
593
594         return NULL;
595 }
596
597 EXPORT_SYMBOL(rt6_lookup);
598
599 /* ip6_ins_rt is called with FREE table->tb6_lock.
600    It takes new route entry, the addition fails by any reason the
601    route is freed. In any case, if caller does not hold it, it may
602    be destroyed.
603  */
604
605 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
606 {
607         int err;
608         struct fib6_table *table;
609
610         table = rt->rt6i_table;
611         write_lock_bh(&table->tb6_lock);
612         err = fib6_add(&table->tb6_root, rt, info);
613         write_unlock_bh(&table->tb6_lock);
614
615         return err;
616 }
617
618 int ip6_ins_rt(struct rt6_info *rt)
619 {
620         struct nl_info info = {
621                 .nl_net = dev_net(rt->rt6i_dev),
622         };
623         return __ip6_ins_rt(rt, &info);
624 }
625
/*
 * Create a host (/128) cache copy of @ort for destination @daddr and bind
 * a neighbour entry to it.
 *
 * The copy comes from ip6_rt_copy().  For a non-gateway route the gateway
 * is set to @daddr itself, and RTF_ANYCAST is set when @ort was not
 * already a /128 and @daddr equals the route's prefix address.  The copy
 * is flagged RTF_CACHE / DST_HOST.  With CONFIG_IPV6_SUBTREES, a
 * source-specific route with a non-NULL @saddr is narrowed to @saddr/128.
 *
 * If ndisc_get_neigh() fails and we are not in softirq context, dst
 * garbage collection is forced once and the neighbour lookup retried.
 * If it still fails (or in softirq), a rate-limited warning is printed,
 * the copy is freed with dst_free() and NULL is returned.
 *
 * Returns the new route, or NULL if the copy or the neighbour lookup
 * failed.
 */
626 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
627                                       struct in6_addr *saddr)
628 {
629         struct rt6_info *rt;
630
631         /*
632          *      Clone the route.
633          */
634
635         rt = ip6_rt_copy(ort);
636
637         if (rt) {
638                 struct neighbour *neigh;
                /* 1 retry outside softirq context, none inside it */
639                 int attempts = !in_softirq();
640
641                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
642                         if (rt->rt6i_dst.plen != 128 &&
643                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
644                                 rt->rt6i_flags |= RTF_ANYCAST;
645                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
646                 }
647
648                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
649                 rt->rt6i_dst.plen = 128;
650                 rt->rt6i_flags |= RTF_CACHE;
651                 rt->dst.flags |= DST_HOST;
652
653 #ifdef CONFIG_IPV6_SUBTREES
654                 if (rt->rt6i_src.plen && saddr) {
655                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
656                         rt->rt6i_src.plen = 128;
657                 }
658 #endif
659
660         retry:
661                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
662                 if (IS_ERR(neigh)) {
663                         struct net *net = dev_net(rt->rt6i_dev);
664                         int saved_rt_min_interval =
665                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
666                         int saved_rt_elasticity =
667                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
668
669                         if (attempts-- > 0) {
                                /*
                                 * Temporarily override the GC sysctls
                                 * (elasticity 1, no minimum interval) for
                                 * this one ip6_dst_gc() call, then restore
                                 * the saved values.
                                 * NOTE(review): other readers of these
                                 * per-net sysctls can observe the
                                 * temporary values - confirm acceptable.
                                 */
670                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
671                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
672
673                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
674
675                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
676                                         saved_rt_elasticity;
677                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
678                                         saved_rt_min_interval;
679                                 goto retry;
680                         }
681
682                         if (net_ratelimit())
683                                 printk(KERN_WARNING
684                                        "ipv6: Neighbour table overflow.\n");
685                         dst_free(&rt->dst);
686                         return NULL;
687                 }
688                 rt->rt6i_nexthop = neigh;
689
690         }
691
692         return rt;
693 }
694
695 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
696 {
697         struct rt6_info *rt = ip6_rt_copy(ort);
698         if (rt) {
699                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
700                 rt->rt6i_dst.plen = 128;
701                 rt->rt6i_flags |= RTF_CACHE;
702                 rt->dst.flags |= DST_HOST;
703                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
704         }
705         return rt;
706 }
707
/*
 * Core per-table route resolution shared by the input and output paths.
 *
 * @oif is the interface index to match (callers pass fl->iif or fl->oif).
 * Only RT6_LOOKUP_F_IFACE is taken from @flags.  Unless forwarding is
 * enabled in devconf_all, the first pass also demands a reachable router
 * (RT6_LOOKUP_F_REACHABLE); if that pass ends at the `out` label, the
 * lookup is repeated once without the reachability requirement.
 *
 * A selected route that is neither the null entry nor already RTF_CACHE
 * is copied into a /128 cache entry: rt6_alloc_cow() when it has no next
 * hop and is not RTF_NONEXTHOP, rt6_alloc_clone() otherwise.  The copy is
 * inserted with ip6_ins_rt().  If allocation or insertion fails, the
 * whole lookup is retried; `attempts` (3) bounds the number of passes.
 *
 * Always returns a route with a reference held via dst_hold() (possibly
 * @net's null entry), with lastuse and __use updated.
 */
708 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
709                                       struct flowi *fl, int flags)
710 {
711         struct fib6_node *fn;
712         struct rt6_info *rt, *nrt;
713         int strict = 0;
714         int attempts = 3;
715         int err;
716         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
717
718         strict |= flags & RT6_LOOKUP_F_IFACE;
719
720 relookup:
721         read_lock_bh(&table->tb6_lock);
722
        /* re-entered from `out` with the read lock still held */
723 restart_2:
724         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
725
        /* BACKTRACK() jumps back here after moving fn up the tree */
726 restart:
727         rt = rt6_select(fn, oif, strict | reachable);
728
729         BACKTRACK(net, &fl->fl6_src);
730         if (rt == net->ipv6.ip6_null_entry ||
731             rt->rt6i_flags & RTF_CACHE)
732                 goto out;
733
        /* pin rt so it can be copied after the table lock is dropped */
734         dst_hold(&rt->dst);
735         read_unlock_bh(&table->tb6_lock);
736
737         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
738                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
739         else
740                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
741
742         dst_release(&rt->dst);
743         rt = nrt ? : net->ipv6.ip6_null_entry;
744
        /* reference for the caller, taken before the insert attempt */
745         dst_hold(&rt->dst);
746         if (nrt) {
747                 err = ip6_ins_rt(nrt);
748                 if (!err)
749                         goto out2;
750         }
751
        /* out of retries: return what we hold (new copy or null entry) */
752         if (--attempts <= 0)
753                 goto out2;
754
755         /*
756          * Race condition! In the gap, when table->tb6_lock was
757          * released someone could insert this route.  Relookup.
758          */
759         dst_release(&rt->dst);
760         goto relookup;
761
762 out:
        /* first pass wanted a reachable router: retry once without */
763         if (reachable) {
764                 reachable = 0;
765                 goto restart_2;
766         }
767         dst_hold(&rt->dst);
768         read_unlock_bh(&table->tb6_lock);
769 out2:
770         rt->dst.lastuse = jiffies;
771         rt->dst.__use++;
772
773         return rt;
774 }
775
776 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
777                                             struct flowi *fl, int flags)
778 {
779         return ip6_pol_route(net, table, fl->iif, fl, flags);
780 }
781
782 void ip6_route_input(struct sk_buff *skb)
783 {
784         struct ipv6hdr *iph = ipv6_hdr(skb);
785         struct net *net = dev_net(skb->dev);
786         int flags = RT6_LOOKUP_F_HAS_SADDR;
787         struct flowi fl = {
788                 .iif = skb->dev->ifindex,
789                 .fl6_dst = iph->daddr,
790                 .fl6_src = iph->saddr,
791                 .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
792                 .mark = skb->mark,
793                 .proto = iph->nexthdr,
794         };
795
796         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
797                 flags |= RT6_LOOKUP_F_IFACE;
798
799         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
800 }
801
802 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
803                                              struct flowi *fl, int flags)
804 {
805         return ip6_pol_route(net, table, fl->oif, fl, flags);
806 }
807
808 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
809                                     struct flowi *fl)
810 {
811         int flags = 0;
812
813         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
814                 flags |= RT6_LOOKUP_F_IFACE;
815
816         if (!ipv6_addr_any(&fl->fl6_src))
817                 flags |= RT6_LOOKUP_F_HAS_SADDR;
818         else if (sk)
819                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
820
821         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
822 }
823
824 EXPORT_SYMBOL(ip6_route_output);
825
826 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
827 {
828         struct rt6_info *ort = (struct rt6_info *) *dstp;
829         struct rt6_info *rt = (struct rt6_info *)
830                 dst_alloc(&ip6_dst_blackhole_ops);
831         struct dst_entry *new = NULL;
832
833         if (rt) {
834                 new = &rt->dst;
835
836                 atomic_set(&new->__refcnt, 1);
837                 new->__use = 1;
838                 new->input = dst_discard;
839                 new->output = dst_discard;
840
841                 dst_copy_metrics(new, &ort->dst);
842                 new->dev = ort->dst.dev;
843                 if (new->dev)
844                         dev_hold(new->dev);
845                 rt->rt6i_idev = ort->rt6i_idev;
846                 if (rt->rt6i_idev)
847                         in6_dev_hold(rt->rt6i_idev);
848                 rt->rt6i_expires = 0;
849
850                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
851                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
852                 rt->rt6i_metric = 0;
853
854                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
855 #ifdef CONFIG_IPV6_SUBTREES
856                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
857 #endif
858
859                 dst_free(new);
860         }
861
862         dst_release(*dstp);
863         *dstp = new;
864         return new ? 0 : -ENOMEM;
865 }
866 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
867
868 /*
869  *      Destination cache support functions
870  */
871
872 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
873 {
874         struct rt6_info *rt;
875
876         rt = (struct rt6_info *) dst;
877
878         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
879                 return dst;
880
881         return NULL;
882 }
883
884 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
885 {
886         struct rt6_info *rt = (struct rt6_info *) dst;
887
888         if (rt) {
889                 if (rt->rt6i_flags & RTF_CACHE) {
890                         if (rt6_check_expired(rt)) {
891                                 ip6_del_rt(rt);
892                                 dst = NULL;
893                         }
894                 } else {
895                         dst_release(dst);
896                         dst = NULL;
897                 }
898         }
899         return dst;
900 }
901
902 static void ip6_link_failure(struct sk_buff *skb)
903 {
904         struct rt6_info *rt;
905
906         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
907
908         rt = (struct rt6_info *) skb_dst(skb);
909         if (rt) {
910                 if (rt->rt6i_flags&RTF_CACHE) {
911                         dst_set_expires(&rt->dst, 0);
912                         rt->rt6i_flags |= RTF_EXPIRES;
913                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
914                         rt->rt6i_node->fn_sernum = -1;
915         }
916 }
917
918 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
919 {
920         struct rt6_info *rt6 = (struct rt6_info*)dst;
921
922         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
923                 rt6->rt6i_flags |= RTF_MODIFIED;
924                 if (mtu < IPV6_MIN_MTU) {
925                         u32 features = dst_metric(dst, RTAX_FEATURES);
926                         mtu = IPV6_MIN_MTU;
927                         features |= RTAX_FEATURE_ALLFRAG;
928                         dst_metric_set(dst, RTAX_FEATURES, features);
929                 }
930                 dst_metric_set(dst, RTAX_MTU, mtu);
931                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
932         }
933 }
934
935 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
936 {
937         struct net_device *dev = dst->dev;
938         unsigned int mtu = dst_mtu(dst);
939         struct net *net = dev_net(dev);
940
941         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
942
943         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
944                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
945
946         /*
947          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
948          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
949          * IPV6_MAXPLEN is also valid and means: "any MSS,
950          * rely only on pmtu discovery"
951          */
952         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
953                 mtu = IPV6_MAXPLEN;
954         return mtu;
955 }
956
957 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
958 {
959         unsigned int mtu = IPV6_MIN_MTU;
960         struct inet6_dev *idev;
961
962         rcu_read_lock();
963         idev = __in6_dev_get(dst->dev);
964         if (idev)
965                 mtu = idev->cnf.mtu6;
966         rcu_read_unlock();
967
968         return mtu;
969 }
970
/*
 * Head of a singly linked list (chained through dst->next) holding the dst
 * entries created by icmp6_dst_alloc(); icmp6_dst_gc() and
 * icmp6_clean_all() unlink and free entries from it.
 */
971 static struct dst_entry *icmp6_dst_gc_list;
/* Held (with bottom halves disabled) around every list access below. */
972 static DEFINE_SPINLOCK(icmp6_dst_lock);
973
974 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
975                                   struct neighbour *neigh,
976                                   const struct in6_addr *addr)
977 {
978         struct rt6_info *rt;
979         struct inet6_dev *idev = in6_dev_get(dev);
980         struct net *net = dev_net(dev);
981
982         if (unlikely(idev == NULL))
983                 return NULL;
984
985         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
986         if (unlikely(rt == NULL)) {
987                 in6_dev_put(idev);
988                 goto out;
989         }
990
991         dev_hold(dev);
992         if (neigh)
993                 neigh_hold(neigh);
994         else {
995                 neigh = ndisc_get_neigh(dev, addr);
996                 if (IS_ERR(neigh))
997                         neigh = NULL;
998         }
999
1000         rt->rt6i_dev      = dev;
1001         rt->rt6i_idev     = idev;
1002         rt->rt6i_nexthop  = neigh;
1003         atomic_set(&rt->dst.__refcnt, 1);
1004         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1005         rt->dst.output  = ip6_output;
1006
1007 #if 0   /* there's no chance to use these for ndisc */
1008         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1009                                 ? DST_HOST
1010                                 : 0;
1011         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1012         rt->rt6i_dst.plen = 128;
1013 #endif
1014
1015         spin_lock_bh(&icmp6_dst_lock);
1016         rt->dst.next = icmp6_dst_gc_list;
1017         icmp6_dst_gc_list = &rt->dst;
1018         spin_unlock_bh(&icmp6_dst_lock);
1019
1020         fib6_force_start_gc(net);
1021
1022 out:
1023         return &rt->dst;
1024 }
1025
1026 int icmp6_dst_gc(void)
1027 {
1028         struct dst_entry *dst, *next, **pprev;
1029         int more = 0;
1030
1031         next = NULL;
1032
1033         spin_lock_bh(&icmp6_dst_lock);
1034         pprev = &icmp6_dst_gc_list;
1035
1036         while ((dst = *pprev) != NULL) {
1037                 if (!atomic_read(&dst->__refcnt)) {
1038                         *pprev = dst->next;
1039                         dst_free(dst);
1040                 } else {
1041                         pprev = &dst->next;
1042                         ++more;
1043                 }
1044         }
1045
1046         spin_unlock_bh(&icmp6_dst_lock);
1047
1048         return more;
1049 }
1050
1051 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1052                             void *arg)
1053 {
1054         struct dst_entry *dst, **pprev;
1055
1056         spin_lock_bh(&icmp6_dst_lock);
1057         pprev = &icmp6_dst_gc_list;
1058         while ((dst = *pprev) != NULL) {
1059                 struct rt6_info *rt = (struct rt6_info *) dst;
1060                 if (func(rt, arg)) {
1061                         *pprev = dst->next;
1062                         dst_free(dst);
1063                 } else {
1064                         pprev = &dst->next;
1065                 }
1066         }
1067         spin_unlock_bh(&icmp6_dst_lock);
1068 }
1069
1070 static int ip6_dst_gc(struct dst_ops *ops)
1071 {
1072         unsigned long now = jiffies;
1073         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1074         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1075         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1076         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1077         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1078         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1079         int entries;
1080
1081         entries = dst_entries_get_fast(ops);
1082         if (time_after(rt_last_gc + rt_min_interval, now) &&
1083             entries <= rt_max_size)
1084                 goto out;
1085
1086         net->ipv6.ip6_rt_gc_expire++;
1087         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1088         net->ipv6.ip6_rt_last_gc = now;
1089         entries = dst_entries_get_slow(ops);
1090         if (entries < ops->gc_thresh)
1091                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1092 out:
1093         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1094         return entries > rt_max_size;
1095 }
1096
1097 /* Clean host part of a prefix. Not necessary in radix tree,
1098    but results in cleaner routing tables.
1099
1100    Remove it only when all the things will work!
1101  */
1102
1103 int ip6_dst_hoplimit(struct dst_entry *dst)
1104 {
1105         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1106         if (hoplimit == 0) {
1107                 struct net_device *dev = dst->dev;
1108                 struct inet6_dev *idev;
1109
1110                 rcu_read_lock();
1111                 idev = __in6_dev_get(dev);
1112                 if (idev)
1113                         hoplimit = idev->cnf.hop_limit;
1114                 else
1115                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1116                 rcu_read_unlock();
1117         }
1118         return hoplimit;
1119 }
1120 EXPORT_SYMBOL(ip6_dst_hoplimit);
1121
1122 /*
1123  *
1124  */
1125
/*
 * Build a route from the configuration in @cfg and insert it into the FIB.
 *
 * Side effects on @cfg: fc_metric is defaulted to IP6_RT_PRIO_USER when 0,
 * fc_protocol is defaulted to RTPROT_BOOT when RTPROT_UNSPEC, and
 * fc_nlinfo.nl_net is overwritten with the namespace of the chosen device.
 *
 * Returns the result of __ip6_ins_rt() once the route has been fully set
 * up, or a negative errno from one of the validation steps:
 *   -EINVAL       bad prefix length, source prefix without
 *                 CONFIG_IPV6_SUBTREES, non-unicast gateway, gateway route
 *                 without a usable (non-loopback) device, or a metric
 *                 attribute type above RTAX_MAX
 *   -ENODEV       device or its inet6_dev not found / no device resolved
 *   -ENOBUFS      routing table could not be obtained
 *   -ENOMEM       route allocation failed
 *   -EHOSTUNREACH no suitable route to a non-link-local gateway
 *   or the error returned by the neighbour lookup.
 *
 * On every "goto out" path the device and inet6_dev references held in the
 * local variables and the partially built route are released.
 */
1126 int ip6_route_add(struct fib6_config *cfg)
1127 {
1128         int err;
1129         struct net *net = cfg->fc_nlinfo.nl_net;
1130         struct rt6_info *rt = NULL;
1131         struct net_device *dev = NULL;
1132         struct inet6_dev *idev = NULL;
1133         struct fib6_table *table;
1134         int addr_type;
1135
             /* Prefix lengths above 128 bits are rejected. */
1136         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1137                 return -EINVAL;
1138 #ifndef CONFIG_IPV6_SUBTREES
             /* Source prefixes are only accepted with subtree support. */
1139         if (cfg->fc_src_len)
1140                 return -EINVAL;
1141 #endif
             /*
              * An explicit interface index pins the device: take a reference
              * on it and on its inet6_dev (both dropped at out: on failure).
              */
1142         if (cfg->fc_ifindex) {
1143                 err = -ENODEV;
1144                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1145                 if (!dev)
1146                         goto out;
1147                 idev = in6_dev_get(dev);
1148                 if (!idev)
1149                         goto out;
1150         }
1151
1152         if (cfg->fc_metric == 0)
1153                 cfg->fc_metric = IP6_RT_PRIO_USER;
1154
1155         table = fib6_new_table(net, cfg->fc_table);
1156         if (table == NULL) {
1157                 err = -ENOBUFS;
1158                 goto out;
1159         }
1160
1161         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1162
1163         if (rt == NULL) {
1164                 err = -ENOMEM;
1165                 goto out;
1166         }
1167
1168         rt->dst.obsolete = -1;
             /* fc_expires is in clock_t units and is relative to now. */
1169         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1170                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1171                                 0;
1172
1173         if (cfg->fc_protocol == RTPROT_UNSPEC)
1174                 cfg->fc_protocol = RTPROT_BOOT;
1175         rt->rt6i_protocol = cfg->fc_protocol;
1176
1177         addr_type = ipv6_addr_type(&cfg->fc_dst);
1178
             /*
              * Input handler: multicast destinations use ip6_mc_input,
              * RTF_LOCAL routes use ip6_input, everything else is forwarded.
              */
1179         if (addr_type & IPV6_ADDR_MULTICAST)
1180                 rt->dst.input = ip6_mc_input;
1181         else if (cfg->fc_flags & RTF_LOCAL)
1182                 rt->dst.input = ip6_input;
1183         else
1184                 rt->dst.input = ip6_forward;
1185
1186         rt->dst.output = ip6_output;
1187
             /* Store the destination masked to its prefix length. */
1188         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1189         rt->rt6i_dst.plen = cfg->fc_dst_len;
1190         if (rt->rt6i_dst.plen == 128)
1191                rt->dst.flags = DST_HOST;
1192
1193 #ifdef CONFIG_IPV6_SUBTREES
1194         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1195         rt->rt6i_src.plen = cfg->fc_src_len;
1196 #endif
1197
1198         rt->rt6i_metric = cfg->fc_metric;
1199
1200         /* We cannot add true routes via loopback here,
1201            they would result in kernel looping; promote them to reject routes
1202          */
1203         if ((cfg->fc_flags & RTF_REJECT) ||
1204             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1205                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1206                 /* hold loopback dev/idev if we haven't done so. */
1207                 if (dev != net->loopback_dev) {
                             /* Swap the references over to the loopback device. */
1208                         if (dev) {
1209                                 dev_put(dev);
1210                                 in6_dev_put(idev);
1211                         }
1212                         dev = net->loopback_dev;
1213                         dev_hold(dev);
1214                         idev = in6_dev_get(dev);
1215                         if (!idev) {
1216                                 err = -ENODEV;
1217                                 goto out;
1218                         }
1219                 }
                     /* Reject routes discard in both directions with -ENETUNREACH. */
1220                 rt->dst.output = ip6_pkt_discard_out;
1221                 rt->dst.input = ip6_pkt_discard;
1222                 rt->dst.error = -ENETUNREACH;
1223                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1224                 goto install_route;
1225         }
1226
1227         if (cfg->fc_flags & RTF_GATEWAY) {
1228                 struct in6_addr *gw_addr;
1229                 int gwa_type;
1230
1231                 gw_addr = &cfg->fc_gateway;
1232                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1233                 gwa_type = ipv6_addr_type(gw_addr);
1234
1235                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1236                         struct rt6_info *grt;
1237
1238                         /* IPv6 strictly inhibits using not link-local
1239                            addresses as nexthop address.
1240                            Otherwise, router will not able to send redirects.
1241                            It is very good, but in some (rare!) circumstances
1242                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1243                            some exceptions. --ANK
1244                          */
1245                         err = -EINVAL;
1246                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1247                                 goto out;
1248
                             /* Look up the existing route towards the gateway. */
1249                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1250
1251                         err = -EHOSTUNREACH;
1252                         if (grt == NULL)
1253                                 goto out;
1254                         if (dev) {
                                     /* A requested device must match the gateway route's. */
1255                                 if (dev != grt->rt6i_dev) {
1256                                         dst_release(&grt->dst);
1257                                         goto out;
1258                                 }
1259                         } else {
                                     /* No device given: adopt the gateway route's, with refs. */
1260                                 dev = grt->rt6i_dev;
1261                                 idev = grt->rt6i_idev;
1262                                 dev_hold(dev);
1263                                 in6_dev_hold(grt->rt6i_idev);
1264                         }
                             /*
                              * err stays -EHOSTUNREACH when the route to the gateway
                              * is itself a gateway route.
                              */
1265                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1266                                 err = 0;
1267                         dst_release(&grt->dst);
1268
1269                         if (err)
1270                                 goto out;
1271                 }
                     /* Gateway routes need a device, and it must not be loopback. */
1272                 err = -EINVAL;
1273                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1274                         goto out;
1275         }
1276
1277         err = -ENODEV;
1278         if (dev == NULL)
1279                 goto out;
1280
             /* Resolve the nexthop neighbour entry for the stored gateway. */
1281         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1282                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1283                 if (IS_ERR(rt->rt6i_nexthop)) {
1284                         err = PTR_ERR(rt->rt6i_nexthop);
                             /* Clear the error pointer before the route is freed at out:. */
1285                         rt->rt6i_nexthop = NULL;
1286                         goto out;
1287                 }
1288         }
1289
1290         rt->rt6i_flags = cfg->fc_flags;
1291
1292 install_route:
             /* Copy user-supplied metrics; attribute type 0 is skipped. */
1293         if (cfg->fc_mx) {
1294                 struct nlattr *nla;
1295                 int remaining;
1296
1297                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1298                         int type = nla_type(nla);
1299
1300                         if (type) {
1301                                 if (type > RTAX_MAX) {
1302                                         err = -EINVAL;
1303                                         goto out;
1304                                 }
1305
1306                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1307                         }
1308                 }
1309         }
1310
             /*
              * The dev/idev pointers (and the references held on them) are
              * stored in the route here; no dev_put/in6_dev_put follows on
              * this path.
              */
1311         rt->dst.dev = dev;
1312         rt->rt6i_idev = idev;
1313         rt->rt6i_table = table;
1314
1315         cfg->fc_nlinfo.nl_net = dev_net(dev);
1316
1317         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1318
1319 out:
             /* Common error exit: undo whatever was acquired so far. */
1320         if (dev)
1321                 dev_put(dev);
1322         if (idev)
1323                 in6_dev_put(idev);
1324         if (rt)
1325                 dst_free(&rt->dst);
1326         return err;
1327 }
1328
1329 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1330 {
1331         int err;
1332         struct fib6_table *table;
1333         struct net *net = dev_net(rt->rt6i_dev);
1334
1335         if (rt == net->ipv6.ip6_null_entry)
1336                 return -ENOENT;
1337
1338         table = rt->rt6i_table;
1339         write_lock_bh(&table->tb6_lock);
1340
1341         err = fib6_del(rt, info);
1342         dst_release(&rt->dst);
1343
1344         write_unlock_bh(&table->tb6_lock);
1345
1346         return err;
1347 }
1348
1349 int ip6_del_rt(struct rt6_info *rt)
1350 {
1351         struct nl_info info = {
1352                 .nl_net = dev_net(rt->rt6i_dev),
1353         };
1354         return __ip6_del_rt(rt, &info);
1355 }
1356
1357 static int ip6_route_del(struct fib6_config *cfg)
1358 {
1359         struct fib6_table *table;
1360         struct fib6_node *fn;
1361         struct rt6_info *rt;
1362         int err = -ESRCH;
1363
1364         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1365         if (table == NULL)
1366                 return err;
1367
1368         read_lock_bh(&table->tb6_lock);
1369
1370         fn = fib6_locate(&table->tb6_root,
1371                          &cfg->fc_dst, cfg->fc_dst_len,
1372                          &cfg->fc_src, cfg->fc_src_len);
1373
1374         if (fn) {
1375                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1376                         if (cfg->fc_ifindex &&
1377                             (rt->rt6i_dev == NULL ||
1378                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1379                                 continue;
1380                         if (cfg->fc_flags & RTF_GATEWAY &&
1381                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1382                                 continue;
1383                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1384                                 continue;
1385                         dst_hold(&rt->dst);
1386                         read_unlock_bh(&table->tb6_lock);
1387
1388                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1389                 }
1390         }
1391         read_unlock_bh(&table->tb6_lock);
1392
1393         return err;
1394 }
1395
1396 /*
1397  *      Handle redirects
1398  */
/*
 * Lookup key for redirect validation: a flowi extended with the gateway
 * address that candidate routes are compared against.
 * ip6_route_redirect() passes a pointer to this as a struct flowi *, and
 * __ip6_route_redirect() casts it back, which relies on fl being the
 * first member.
 */
1399 struct ip6rd_flowi {
1400         struct flowi fl;
1401         struct in6_addr gateway;
1402 };
1403
/*
 * Per-table lookup callback handed to fib6_rule_lookup() by
 * ip6_route_redirect().
 *
 * @fl actually points to a struct ip6rd_flowi. The routes of the node found
 * for fl6_dst/fl6_src are scanned, under the table's read lock, for one that
 * is not expired, has RTF_GATEWAY set, uses the interface fl->oif and whose
 * gateway equals the gateway carried in the ip6rd_flowi.
 *
 * Returns the matching route, or the namespace's ip6_null_entry when the
 * scan ends without a match, with a reference taken (dst_hold) in either
 * case.
 *
 * NOTE(review): BACKTRACK is a macro defined outside this chunk; presumably
 * it uses the local "fn" and the "restart"/"out" labels below - confirm
 * against its definition before renaming any of them. @flags is not
 * referenced directly in the visible body.
 */
1404 static struct rt6_info *__ip6_route_redirect(struct net *net,
1405                                              struct fib6_table *table,
1406                                              struct flowi *fl,
1407                                              int flags)
1408 {
1409         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1410         struct rt6_info *rt;
1411         struct fib6_node *fn;
1412
1413         /*
1414          * Get the "current" route for this destination and
1415          * check if the redirect has come from appropriate router.
1416          *
1417          * RFC 2461 specifies that redirects should only be
1418          * accepted if they come from the nexthop to the target.
1419          * Due to the way the routes are chosen, this notion
1420          * is a bit fuzzy and one might need to check all possible
1421          * routes.
1422          */
1423
1424         read_lock_bh(&table->tb6_lock);
1425         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1426 restart:
1427         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1428                 /*
1429                  * Current route is on-link; redirect is always invalid.
1430                  *
1431                  * Seems, previous statement is not true. It could
1432                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1433                  * But then router serving it might decide, that we should
1434                  * know truth 8)8) --ANK (980726).
1435                  */
1436                 if (rt6_check_expired(rt))
1437                         continue;
1438                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1439                         continue;
1440                 if (fl->oif != rt->rt6i_dev->ifindex)
1441                         continue;
1442                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1443                         continue;
1444                 break;
1445         }
1446
             /* No candidate in this node: fall back to the null entry. */
1447         if (!rt)
1448                 rt = net->ipv6.ip6_null_entry;
1449         BACKTRACK(net, &fl->fl6_src);
1450 out:
             /* Take the caller's reference before the table lock is dropped. */
1451         dst_hold(&rt->dst);
1452
1453         read_unlock_bh(&table->tb6_lock);
1454
1455         return rt;
1456 };
1457
1458 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1459                                            struct in6_addr *src,
1460                                            struct in6_addr *gateway,
1461                                            struct net_device *dev)
1462 {
1463         int flags = RT6_LOOKUP_F_HAS_SADDR;
1464         struct net *net = dev_net(dev);
1465         struct ip6rd_flowi rdfl = {
1466                 .fl = {
1467                         .oif = dev->ifindex,
1468                         .fl6_dst = *dest,
1469                         .fl6_src = *src,
1470                 },
1471         };
1472
1473         ipv6_addr_copy(&rdfl.gateway, gateway);
1474
1475         if (rt6_need_strict(dest))
1476                 flags |= RT6_LOOKUP_F_IFACE;
1477
1478         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1479                                                    flags, __ip6_route_redirect);
1480 }
1481
1482 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1483                   struct in6_addr *saddr,
1484                   struct neighbour *neigh, u8 *lladdr, int on_link)
1485 {
1486         struct rt6_info *rt, *nrt = NULL;
1487         struct netevent_redirect netevent;
1488         struct net *net = dev_net(neigh->dev);
1489
1490         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1491
1492         if (rt == net->ipv6.ip6_null_entry) {
1493                 if (net_ratelimit())
1494                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1495                                "for redirect target\n");
1496                 goto out;
1497         }
1498
1499         /*
1500          *      We have finally decided to accept it.
1501          */
1502
1503         neigh_update(neigh, lladdr, NUD_STALE,
1504                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1505                      NEIGH_UPDATE_F_OVERRIDE|
1506                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1507                                      NEIGH_UPDATE_F_ISROUTER))
1508                      );
1509
1510         /*
1511          * Redirect received -> path was valid.
1512          * Look, redirects are sent only in response to data packets,
1513          * so that this nexthop apparently is reachable. --ANK
1514          */
1515         dst_confirm(&rt->dst);
1516
1517         /* Duplicate redirect: silently ignore. */
1518         if (neigh == rt->dst.neighbour)
1519                 goto out;
1520
1521         nrt = ip6_rt_copy(rt);
1522         if (nrt == NULL)
1523                 goto out;
1524
1525         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1526         if (on_link)
1527                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1528
1529         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1530         nrt->rt6i_dst.plen = 128;
1531         nrt->dst.flags |= DST_HOST;
1532
1533         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1534         nrt->rt6i_nexthop = neigh_clone(neigh);
1535
1536         if (ip6_ins_rt(nrt))
1537                 goto out;
1538
1539         netevent.old = &rt->dst;
1540         netevent.new = &nrt->dst;
1541         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1542
1543         if (rt->rt6i_flags&RTF_CACHE) {
1544                 ip6_del_rt(rt);
1545                 return;
1546         }
1547
1548 out:
1549         dst_release(&rt->dst);
1550 }
1551
1552 /*
1553  *      Handle ICMP "packet too big" messages
1554  *      i.e. Path MTU discovery
1555  */
1556
1557 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1558                              struct net *net, u32 pmtu, int ifindex)
1559 {
1560         struct rt6_info *rt, *nrt;
1561         int allfrag = 0;
1562 again:
1563         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1564         if (rt == NULL)
1565                 return;
1566
1567         if (rt6_check_expired(rt)) {
1568                 ip6_del_rt(rt);
1569                 goto again;
1570         }
1571
1572         if (pmtu >= dst_mtu(&rt->dst))
1573                 goto out;
1574
1575         if (pmtu < IPV6_MIN_MTU) {
1576                 /*
1577                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1578                  * MTU (1280) and a fragment header should always be included
1579                  * after a node receiving Too Big message reporting PMTU is
1580                  * less than the IPv6 Minimum Link MTU.
1581                  */
1582                 pmtu = IPV6_MIN_MTU;
1583                 allfrag = 1;
1584         }
1585
1586         /* New mtu received -> path was valid.
1587            They are sent only in response to data packets,
1588            so that this nexthop apparently is reachable. --ANK
1589          */
1590         dst_confirm(&rt->dst);
1591
1592         /* Host route. If it is static, it would be better
1593            not to override it, but add new one, so that
1594            when cache entry will expire old pmtu
1595            would return automatically.
1596          */
1597         if (rt->rt6i_flags & RTF_CACHE) {
1598                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1599                 if (allfrag) {
1600                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1601                         features |= RTAX_FEATURE_ALLFRAG;
1602                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1603                 }
1604                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1605                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1606                 goto out;
1607         }
1608
1609         /* Network route.
1610            Two cases are possible:
1611            1. It is connected route. Action: COW
1612            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1613          */
1614         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1615                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1616         else
1617                 nrt = rt6_alloc_clone(rt, daddr);
1618
1619         if (nrt) {
1620                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1621                 if (allfrag) {
1622                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1623                         features |= RTAX_FEATURE_ALLFRAG;
1624                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1625                 }
1626
1627                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1628                  * happened within 5 mins, the recommended timer is 10 mins.
1629                  * Here this route expiration time is set to ip6_rt_mtu_expires
1630                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1631                  * and detecting PMTU increase will be automatically happened.
1632                  */
1633                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1634                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1635
1636                 ip6_ins_rt(nrt);
1637         }
1638 out:
1639         dst_release(&rt->dst);
1640 }
1641
1642 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1643                         struct net_device *dev, u32 pmtu)
1644 {
1645         struct net *net = dev_net(dev);
1646
1647         /*
1648          * RFC 1981 states that a node "MUST reduce the size of the packets it
1649          * is sending along the path" that caused the Packet Too Big message.
1650          * Since it's not possible in the general case to determine which
1651          * interface was used to send the original packet, we update the MTU
1652          * on the interface that will be used to send future packets. We also
1653          * update the MTU on the interface that received the Packet Too Big in
1654          * case the original packet was forced out that interface with
1655          * SO_BINDTODEVICE or similar. This is the next best thing to the
1656          * correct behaviour, which would be to update the MTU on all
1657          * interfaces.
1658          */
1659         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1660         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1661 }
1662
1663 /*
1664  *      Misc support functions
1665  */
1666
1667 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1668 {
1669         struct net *net = dev_net(ort->rt6i_dev);
1670         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1671
1672         if (rt) {
1673                 rt->dst.input = ort->dst.input;
1674                 rt->dst.output = ort->dst.output;
1675
1676                 dst_copy_metrics(&rt->dst, &ort->dst);
1677                 rt->dst.error = ort->dst.error;
1678                 rt->dst.dev = ort->dst.dev;
1679                 if (rt->dst.dev)
1680                         dev_hold(rt->dst.dev);
1681                 rt->rt6i_idev = ort->rt6i_idev;
1682                 if (rt->rt6i_idev)
1683                         in6_dev_hold(rt->rt6i_idev);
1684                 rt->dst.lastuse = jiffies;
1685                 rt->rt6i_expires = 0;
1686
1687                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1688                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1689                 rt->rt6i_metric = 0;
1690
1691                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1692 #ifdef CONFIG_IPV6_SUBTREES
1693                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1694 #endif
1695                 rt->rt6i_table = ort->rt6i_table;
1696         }
1697         return rt;
1698 }
1699
1700 #ifdef CONFIG_IPV6_ROUTE_INFO
1701 static struct rt6_info *rt6_get_route_info(struct net *net,
1702                                            struct in6_addr *prefix, int prefixlen,
1703                                            struct in6_addr *gwaddr, int ifindex)
1704 {
1705         struct fib6_node *fn;
1706         struct rt6_info *rt = NULL;
1707         struct fib6_table *table;
1708
1709         table = fib6_get_table(net, RT6_TABLE_INFO);
1710         if (table == NULL)
1711                 return NULL;
1712
1713         write_lock_bh(&table->tb6_lock);
1714         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1715         if (!fn)
1716                 goto out;
1717
1718         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1719                 if (rt->rt6i_dev->ifindex != ifindex)
1720                         continue;
1721                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1722                         continue;
1723                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1724                         continue;
1725                 dst_hold(&rt->dst);
1726                 break;
1727         }
1728 out:
1729         write_unlock_bh(&table->tb6_lock);
1730         return rt;
1731 }
1732
1733 static struct rt6_info *rt6_add_route_info(struct net *net,
1734                                            struct in6_addr *prefix, int prefixlen,
1735                                            struct in6_addr *gwaddr, int ifindex,
1736                                            unsigned pref)
1737 {
1738         struct fib6_config cfg = {
1739                 .fc_table       = RT6_TABLE_INFO,
1740                 .fc_metric      = IP6_RT_PRIO_USER,
1741                 .fc_ifindex     = ifindex,
1742                 .fc_dst_len     = prefixlen,
1743                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1744                                   RTF_UP | RTF_PREF(pref),
1745                 .fc_nlinfo.pid = 0,
1746                 .fc_nlinfo.nlh = NULL,
1747                 .fc_nlinfo.nl_net = net,
1748         };
1749
1750         ipv6_addr_copy(&cfg.fc_dst, prefix);
1751         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1752
1753         /* We should treat it as a default route if prefix length is 0. */
1754         if (!prefixlen)
1755                 cfg.fc_flags |= RTF_DEFAULT;
1756
1757         ip6_route_add(&cfg);
1758
1759         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1760 }
1761 #endif
1762
1763 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1764 {
1765         struct rt6_info *rt;
1766         struct fib6_table *table;
1767
1768         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1769         if (table == NULL)
1770                 return NULL;
1771
1772         write_lock_bh(&table->tb6_lock);
1773         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1774                 if (dev == rt->rt6i_dev &&
1775                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1776                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1777                         break;
1778         }
1779         if (rt)
1780                 dst_hold(&rt->dst);
1781         write_unlock_bh(&table->tb6_lock);
1782         return rt;
1783 }
1784
1785 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1786                                      struct net_device *dev,
1787                                      unsigned int pref)
1788 {
1789         struct fib6_config cfg = {
1790                 .fc_table       = RT6_TABLE_DFLT,
1791                 .fc_metric      = IP6_RT_PRIO_USER,
1792                 .fc_ifindex     = dev->ifindex,
1793                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1794                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1795                 .fc_nlinfo.pid = 0,
1796                 .fc_nlinfo.nlh = NULL,
1797                 .fc_nlinfo.nl_net = dev_net(dev),
1798         };
1799
1800         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1801
1802         ip6_route_add(&cfg);
1803
1804         return rt6_get_dflt_router(gwaddr, dev);
1805 }
1806
1807 void rt6_purge_dflt_routers(struct net *net)
1808 {
1809         struct rt6_info *rt;
1810         struct fib6_table *table;
1811
1812         /* NOTE: Keep consistent with rt6_get_dflt_router */
1813         table = fib6_get_table(net, RT6_TABLE_DFLT);
1814         if (table == NULL)
1815                 return;
1816
1817 restart:
1818         read_lock_bh(&table->tb6_lock);
1819         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1820                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1821                         dst_hold(&rt->dst);
1822                         read_unlock_bh(&table->tb6_lock);
1823                         ip6_del_rt(rt);
1824                         goto restart;
1825                 }
1826         }
1827         read_unlock_bh(&table->tb6_lock);
1828 }
1829
1830 static void rtmsg_to_fib6_config(struct net *net,
1831                                  struct in6_rtmsg *rtmsg,
1832                                  struct fib6_config *cfg)
1833 {
1834         memset(cfg, 0, sizeof(*cfg));
1835
1836         cfg->fc_table = RT6_TABLE_MAIN;
1837         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1838         cfg->fc_metric = rtmsg->rtmsg_metric;
1839         cfg->fc_expires = rtmsg->rtmsg_info;
1840         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1841         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1842         cfg->fc_flags = rtmsg->rtmsg_flags;
1843
1844         cfg->fc_nlinfo.nl_net = net;
1845
1846         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1847         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1848         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1849 }
1850
1851 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1852 {
1853         struct fib6_config cfg;
1854         struct in6_rtmsg rtmsg;
1855         int err;
1856
1857         switch(cmd) {
1858         case SIOCADDRT:         /* Add a route */
1859         case SIOCDELRT:         /* Delete a route */
1860                 if (!capable(CAP_NET_ADMIN))
1861                         return -EPERM;
1862                 err = copy_from_user(&rtmsg, arg,
1863                                      sizeof(struct in6_rtmsg));
1864                 if (err)
1865                         return -EFAULT;
1866
1867                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1868
1869                 rtnl_lock();
1870                 switch (cmd) {
1871                 case SIOCADDRT:
1872                         err = ip6_route_add(&cfg);
1873                         break;
1874                 case SIOCDELRT:
1875                         err = ip6_route_del(&cfg);
1876                         break;
1877                 default:
1878                         err = -EINVAL;
1879                 }
1880                 rtnl_unlock();
1881
1882                 return err;
1883         }
1884
1885         return -EINVAL;
1886 }
1887
1888 /*
1889  *      Drop the packet on the floor
1890  */
1891
1892 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1893 {
1894         int type;
1895         struct dst_entry *dst = skb_dst(skb);
1896         switch (ipstats_mib_noroutes) {
1897         case IPSTATS_MIB_INNOROUTES:
1898                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1899                 if (type == IPV6_ADDR_ANY) {
1900                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1901                                       IPSTATS_MIB_INADDRERRORS);
1902                         break;
1903                 }
1904                 /* FALLTHROUGH */
1905         case IPSTATS_MIB_OUTNOROUTES:
1906                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1907                               ipstats_mib_noroutes);
1908                 break;
1909         }
1910         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1911         kfree_skb(skb);
1912         return 0;
1913 }
1914
/*
 * Drop @skb via ip6_pkt_drop() with ICMPv6 code ICMPV6_NOROUTE, counting
 * it under IPSTATS_MIB_INNOROUTES.  Returns ip6_pkt_drop()'s result.
 */
1915 static int ip6_pkt_discard(struct sk_buff *skb)
1916 {
1917         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1918 }
1919
1920 static int ip6_pkt_discard_out(struct sk_buff *skb)
1921 {
1922         skb->dev = skb_dst(skb)->dev;
1923         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1924 }
1925
1926 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1927
/*
 * Drop @skb via ip6_pkt_drop() with ICMPv6 code ICMPV6_ADM_PROHIBITED,
 * counting it under IPSTATS_MIB_INNOROUTES.
 */
1928 static int ip6_pkt_prohibit(struct sk_buff *skb)
1929 {
1930         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1931 }
1932
1933 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1934 {
1935         skb->dev = skb_dst(skb)->dev;
1936         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1937 }
1938
1939 #endif
1940
1941 /*
1942  *      Allocate a dst for local (unicast / anycast) address.
1943  */
1944
1945 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1946                                     const struct in6_addr *addr,
1947                                     int anycast)
1948 {
1949         struct net *net = dev_net(idev->dev);
1950         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1951         struct neighbour *neigh;
1952
1953         if (rt == NULL) {
1954                 if (net_ratelimit())
1955                         pr_warning("IPv6:  Maximum number of routes reached,"
1956                                    " consider increasing route/max_size.\n");
1957                 return ERR_PTR(-ENOMEM);
1958         }
1959
1960         dev_hold(net->loopback_dev);
1961         in6_dev_hold(idev);
1962
1963         rt->dst.flags = DST_HOST;
1964         rt->dst.input = ip6_input;
1965         rt->dst.output = ip6_output;
1966         rt->rt6i_dev = net->loopback_dev;
1967         rt->rt6i_idev = idev;
1968         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1969         rt->dst.obsolete = -1;
1970
1971         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1972         if (anycast)
1973                 rt->rt6i_flags |= RTF_ANYCAST;
1974         else
1975                 rt->rt6i_flags |= RTF_LOCAL;
1976         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1977         if (IS_ERR(neigh)) {
1978                 dst_free(&rt->dst);
1979
1980                 /* We are casting this because that is the return
1981                  * value type.  But an errno encoded pointer is the
1982                  * same regardless of the underlying pointer type,
1983                  * and that's what we are returning.  So this is OK.
1984                  */
1985                 return (struct rt6_info *) neigh;
1986         }
1987         rt->rt6i_nexthop = neigh;
1988
1989         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1990         rt->rt6i_dst.plen = 128;
1991         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1992
1993         atomic_set(&rt->dst.__refcnt, 1);
1994
1995         return rt;
1996 }
1997
/*
 * Argument bundle that rt6_ifdown() passes to fib6_ifdown() through the
 * walkers' void * parameter.
 */
1998 struct arg_dev_net {
     /* Device being matched; fib6_ifdown() treats NULL as "any device". */
1999         struct net_device *dev;
     /* Namespace whose ip6_null_entry must never be selected. */
2000         struct net *net;
2001 };
2002
2003 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2004 {
2005         const struct arg_dev_net *adn = arg;
2006         const struct net_device *dev = adn->dev;
2007
2008         if ((rt->rt6i_dev == dev || dev == NULL) &&
2009             rt != adn->net->ipv6.ip6_null_entry) {
2010                 RT6_TRACE("deleted by ifdown %p\n", rt);
2011                 return -1;
2012         }
2013         return 0;
2014 }
2015
2016 void rt6_ifdown(struct net *net, struct net_device *dev)
2017 {
2018         struct arg_dev_net adn = {
2019                 .dev = dev,
2020                 .net = net,
2021         };
2022
2023         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2024         icmp6_clean_all(fib6_ifdown, &adn);
2025 }
2026
/*
 * Argument bundle that rt6_mtu_change() passes to rt6_mtu_change_route()
 * through the walker's void * parameter.
 */
2027 struct rt6_mtu_change_arg
2028 {
     /* Device whose MTU changed; only routes on this device are touched. */
2029         struct net_device *dev;
     /* The new MTU value to propagate into route metrics. */
2030         unsigned mtu;
2031 };
2032
2033 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2034 {
2035         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2036         struct inet6_dev *idev;
2037
2038         /* In IPv6 pmtu discovery is not optional,
2039            so that RTAX_MTU lock cannot disable it.
2040            We still use this lock to block changes
2041            caused by addrconf/ndisc.
2042         */
2043
2044         idev = __in6_dev_get(arg->dev);
2045         if (idev == NULL)
2046                 return 0;
2047
2048         /* For administrative MTU increase, there is no way to discover
2049            IPv6 PMTU increase, so PMTU increase should be updated here.
2050            Since RFC 1981 doesn't include administrative MTU increase
2051            update PMTU increase is a MUST. (i.e. jumbo frame)
2052          */
2053         /*
2054            If new MTU is less than route PMTU, this new MTU will be the
2055            lowest MTU in the path, update the route PMTU to reflect PMTU
2056            decreases; if new MTU is greater than route PMTU, and the
2057            old MTU is the lowest MTU in the path, update the route PMTU
2058            to reflect the increase. In this case if the other nodes' MTU
2059            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2060            PMTU discouvery.
2061          */
2062         if (rt->rt6i_dev == arg->dev &&
2063             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2064             (dst_mtu(&rt->dst) >= arg->mtu ||
2065              (dst_mtu(&rt->dst) < arg->mtu &&
2066               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2067                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2068         }
2069         return 0;
2070 }
2071
2072 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2073 {
2074         struct rt6_mtu_change_arg arg = {
2075                 .dev = dev,
2076                 .mtu = mtu,
2077         };
2078
2079         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2080 }
2081
2082 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2083         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2084         [RTA_OIF]               = { .type = NLA_U32 },
2085         [RTA_IIF]               = { .type = NLA_U32 },
2086         [RTA_PRIORITY]          = { .type = NLA_U32 },
2087         [RTA_METRICS]           = { .type = NLA_NESTED },
2088 };
2089
2090 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2091                               struct fib6_config *cfg)
2092 {
2093         struct rtmsg *rtm;
2094         struct nlattr *tb[RTA_MAX+1];
2095         int err;
2096
2097         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2098         if (err < 0)
2099                 goto errout;
2100
2101         err = -EINVAL;
2102         rtm = nlmsg_data(nlh);
2103         memset(cfg, 0, sizeof(*cfg));
2104
2105         cfg->fc_table = rtm->rtm_table;
2106         cfg->fc_dst_len = rtm->rtm_dst_len;
2107         cfg->fc_src_len = rtm->rtm_src_len;
2108         cfg->fc_flags = RTF_UP;
2109         cfg->fc_protocol = rtm->rtm_protocol;
2110
2111         if (rtm->rtm_type == RTN_UNREACHABLE)
2112                 cfg->fc_flags |= RTF_REJECT;
2113
2114         if (rtm->rtm_type == RTN_LOCAL)
2115                 cfg->fc_flags |= RTF_LOCAL;
2116
2117         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2118         cfg->fc_nlinfo.nlh = nlh;
2119         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2120
2121         if (tb[RTA_GATEWAY]) {
2122                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2123                 cfg->fc_flags |= RTF_GATEWAY;
2124         }
2125
2126         if (tb[RTA_DST]) {
2127                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2128
2129                 if (nla_len(tb[RTA_DST]) < plen)
2130                         goto errout;
2131
2132                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2133         }
2134
2135         if (tb[RTA_SRC]) {
2136                 int plen = (rtm->rtm_src_len + 7) >> 3;
2137
2138                 if (nla_len(tb[RTA_SRC]) < plen)
2139                         goto errout;
2140
2141                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2142         }
2143
2144         if (tb[RTA_OIF])
2145                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2146
2147         if (tb[RTA_PRIORITY])
2148                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2149
2150         if (tb[RTA_METRICS]) {
2151                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2152                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2153         }
2154
2155         if (tb[RTA_TABLE])
2156                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2157
2158         err = 0;
2159 errout:
2160         return err;
2161 }
2162
2163 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2164 {
2165         struct fib6_config cfg;
2166         int err;
2167
2168         err = rtm_to_fib6_config(skb, nlh, &cfg);
2169         if (err < 0)
2170                 return err;
2171
2172         return ip6_route_del(&cfg);
2173 }
2174
2175 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2176 {
2177         struct fib6_config cfg;
2178         int err;
2179
2180         err = rtm_to_fib6_config(skb, nlh, &cfg);
2181         if (err < 0)
2182                 return err;
2183
2184         return ip6_route_add(&cfg);
2185 }
2186
2187 static inline size_t rt6_nlmsg_size(void)
2188 {
2189         return NLMSG_ALIGN(sizeof(struct rtmsg))
2190                + nla_total_size(16) /* RTA_SRC */
2191                + nla_total_size(16) /* RTA_DST */
2192                + nla_total_size(16) /* RTA_GATEWAY */
2193                + nla_total_size(16) /* RTA_PREFSRC */
2194                + nla_total_size(4) /* RTA_TABLE */
2195                + nla_total_size(4) /* RTA_IIF */
2196                + nla_total_size(4) /* RTA_OIF */
2197                + nla_total_size(4) /* RTA_PRIORITY */
2198                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2199                + nla_total_size(sizeof(struct rta_cacheinfo));
2200 }
2201
/*
 * Append one rtnetlink route message of type @type describing @rt to @skb.
 *
 * @dst / @src: when non-NULL, reported as RTA_DST / RTA_SRC with a prefix
 *              length of 128 instead of the route's own keys (RTA_SRC is
 *              only emitted under CONFIG_IPV6_SUBTREES)
 * @iif:        when non-zero, reported as RTA_IIF (multicast destinations
 *              go through ip6mr_get_route() under CONFIG_IPV6_MROUTE)
 * @prefix:     when non-zero, routes without RTF_PREFIX_RT are skipped
 * @nowait:     forwarded to ip6mr_get_route()
 *
 * Returns 1 when the route is skipped because of @prefix, 0 when
 * ip6mr_get_route() returns 0 with @nowait clear, -EMSGSIZE when the
 * header or an attribute does not fit (the partial message is cancelled),
 * and otherwise the value of nlmsg_end().
 *
 * The NLA_PUT*() macros are defined elsewhere; they are expected to jump
 * to the nla_put_failure label when the skb has no room left.
 */
2202 static int rt6_fill_node(struct net *net,
2203                          struct sk_buff *skb, struct rt6_info *rt,
2204                          struct in6_addr *dst, struct in6_addr *src,
2205                          int iif, int type, u32 pid, u32 seq,
2206                          int prefix, int nowait, unsigned int flags)
2207 {
2208         struct rtmsg *rtm;
2209         struct nlmsghdr *nlh;
2210         long expires;
2211         u32 table;
2212
2213         if (prefix) {   /* user wants prefix routes only */
2214                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2215                         /* success since this is not a prefix route */
2216                         return 1;
2217                 }
2218         }
2219
2220         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2221         if (nlh == NULL)
2222                 return -EMSGSIZE;
2223
2224         rtm = nlmsg_data(nlh);
2225         rtm->rtm_family = AF_INET6;
2226         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2227         rtm->rtm_src_len = rt->rt6i_src.plen;
2228         rtm->rtm_tos = 0;
2229         if (rt->rt6i_table)
2230                 table = rt->rt6i_table->tb6_id;
2231         else
2232                 table = RT6_TABLE_UNSPEC;
2233         rtm->rtm_table = table;
2234         NLA_PUT_U32(skb, RTA_TABLE, table);
             /* Route type: reject, local (by flag or loopback device), else unicast. */
2235         if (rt->rt6i_flags&RTF_REJECT)
2236                 rtm->rtm_type = RTN_UNREACHABLE;
2237         else if (rt->rt6i_flags&RTF_LOCAL)
2238                 rtm->rtm_type = RTN_LOCAL;
2239         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2240                 rtm->rtm_type = RTN_LOCAL;
2241         else
2242                 rtm->rtm_type = RTN_UNICAST;
2243         rtm->rtm_flags = 0;
2244         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
             /* Start from the stored protocol, then let route flags override it. */
2245         rtm->rtm_protocol = rt->rt6i_protocol;
2246         if (rt->rt6i_flags&RTF_DYNAMIC)
2247                 rtm->rtm_protocol = RTPROT_REDIRECT;
2248         else if (rt->rt6i_flags & RTF_ADDRCONF)
2249                 rtm->rtm_protocol = RTPROT_KERNEL;
2250         else if (rt->rt6i_flags&RTF_DEFAULT)
2251                 rtm->rtm_protocol = RTPROT_RA;
2252
2253         if (rt->rt6i_flags&RTF_CACHE)
2254                 rtm->rtm_flags |= RTM_F_CLONED;
2255
2256         if (dst) {
2257                 NLA_PUT(skb, RTA_DST, 16, dst);
2258                 rtm->rtm_dst_len = 128;
2259         } else if (rtm->rtm_dst_len)
2260                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2261 #ifdef CONFIG_IPV6_SUBTREES
2262         if (src) {
2263                 NLA_PUT(skb, RTA_SRC, 16, src);
2264                 rtm->rtm_src_len = 128;
2265         } else if (rtm->rtm_src_len)
2266                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2267 #endif
2268         if (iif) {
2269 #ifdef CONFIG_IPV6_MROUTE
2270                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2271                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2272                         if (err <= 0) {
2273                                 if (!nowait) {
2274                                         if (err == 0)
2275                                                 return 0;
2276                                         goto nla_put_failure;
2277                                 } else {
2278                                         if (err == -EMSGSIZE)
2279                                                 goto nla_put_failure;
2280                                 }
2281                         }
2282                 } else
2283 #endif
2284                         NLA_PUT_U32(skb, RTA_IIF, iif);
2285         } else if (dst) {
                     /* No input interface: report the source address selected for dst. */
2286                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2287                 struct in6_addr saddr_buf;
2288                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2289                                        dst, 0, &saddr_buf) == 0)
2290                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2291         }
2292
2293         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2294                 goto nla_put_failure;
2295
2296         if (rt->dst.neighbour)
2297                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2298
2299         if (rt->dst.dev)
2300                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2301
2302         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2303
             /*
              * Remaining lifetime in jiffies, capped at INT_MAX; 0 when the
              * route does not expire.
              * NOTE(review): if rt6i_expires and jiffies are unsigned (not
              * visible here), a deadline already in the past wraps and is
              * reported as INT_MAX rather than as expired - confirm.
              */
2304         if (!(rt->rt6i_flags & RTF_EXPIRES))
2305                 expires = 0;
2306         else if (rt->rt6i_expires - jiffies < INT_MAX)
2307                 expires = rt->rt6i_expires - jiffies;
2308         else
2309                 expires = INT_MAX;
2310
2311         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2312                                expires, rt->dst.error) < 0)
2313                 goto nla_put_failure;
2314
2315         return nlmsg_end(skb, nlh);
2316
2317 nla_put_failure:
2318         nlmsg_cancel(skb, nlh);
2319         return -EMSGSIZE;
2320 }
2321
2322 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2323 {
2324         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2325         int prefix;
2326
2327         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2328                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2329                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2330         } else
2331                 prefix = 0;
2332
2333         return rt6_fill_node(arg->net,
2334                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2335                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2336                      prefix, 0, NLM_F_MULTI);
2337 }
2338
2339 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2340 {
2341         struct net *net = sock_net(in_skb->sk);
2342         struct nlattr *tb[RTA_MAX+1];
2343         struct rt6_info *rt;
2344         struct sk_buff *skb;
2345         struct rtmsg *rtm;
2346         struct flowi fl;
2347         int err, iif = 0;
2348
2349         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2350         if (err < 0)
2351                 goto errout;
2352
2353         err = -EINVAL;
2354         memset(&fl, 0, sizeof(fl));
2355
2356         if (tb[RTA_SRC]) {
2357                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2358                         goto errout;
2359
2360                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2361         }
2362
2363         if (tb[RTA_DST]) {
2364                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2365                         goto errout;
2366
2367                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2368         }
2369
2370         if (tb[RTA_IIF])
2371                 iif = nla_get_u32(tb[RTA_IIF]);
2372
2373         if (tb[RTA_OIF])
2374                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2375
2376         if (iif) {
2377                 struct net_device *dev;
2378                 dev = __dev_get_by_index(net, iif);
2379                 if (!dev) {
2380                         err = -ENODEV;
2381                         goto errout;
2382                 }
2383         }
2384
2385         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2386         if (skb == NULL) {
2387                 err = -ENOBUFS;
2388                 goto errout;
2389         }
2390
2391         /* Reserve room for dummy headers, this skb can pass
2392            through good chunk of routing engine.
2393          */
2394         skb_reset_mac_header(skb);
2395         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2396
2397         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2398         skb_dst_set(skb, &rt->dst);
2399
2400         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2401                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2402                             nlh->nlmsg_seq, 0, 0, 0);
2403         if (err < 0) {
2404                 kfree_skb(skb);
2405                 goto errout;
2406         }
2407
2408         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2409 errout:
2410         return err;
2411 }
2412
2413 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2414 {
2415         struct sk_buff *skb;
2416         struct net *net = info->nl_net;
2417         u32 seq;
2418         int err;
2419
2420         err = -ENOBUFS;
2421         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2422
2423         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2424         if (skb == NULL)
2425                 goto errout;
2426
2427         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2428                                 event, info->pid, seq, 0, 0, 0);
2429         if (err < 0) {
2430                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2431                 WARN_ON(err == -EMSGSIZE);
2432                 kfree_skb(skb);
2433                 goto errout;
2434         }
2435         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2436                     info->nlh, gfp_any());
2437         return;
2438 errout:
2439         if (err < 0)
2440                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2441 }
2442
2443 static int ip6_route_dev_notify(struct notifier_block *this,
2444                                 unsigned long event, void *data)
2445 {
2446         struct net_device *dev = (struct net_device *)data;
2447         struct net *net = dev_net(dev);
2448
2449         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2450                 net->ipv6.ip6_null_entry->dst.dev = dev;
2451                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2452 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2453                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2454                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2455                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2456                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2457 #endif
2458         }
2459
2460         return NOTIFY_OK;
2461 }
2462
2463 /*
2464  *      /proc
2465  */
2466
2467 #ifdef CONFIG_PROC_FS
2468
/*
 * Bookkeeping for a buffer-based /proc reader.
 * NOTE(review): nothing in the visible part of this file references this
 * structure (the /proc handlers below use seq_file); it looks like a
 * leftover - confirm before removing.
 */
2469 struct rt6_proc_arg
2470 {
2471         char *buffer;
2472         int offset;
2473         int length;
2474         int skip;
2475         int len;
2476 };
2477
2478 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2479 {
2480         struct seq_file *m = p_arg;
2481
2482         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2483
2484 #ifdef CONFIG_IPV6_SUBTREES
2485         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2486 #else
2487         seq_puts(m, "00000000000000000000000000000000 00 ");
2488 #endif
2489
2490         if (rt->rt6i_nexthop) {
2491                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2492         } else {
2493                 seq_puts(m, "00000000000000000000000000000000");
2494         }
2495         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2496                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2497                    rt->dst.__use, rt->rt6i_flags,
2498                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2499         return 0;
2500 }
2501
2502 static int ipv6_route_show(struct seq_file *m, void *v)
2503 {
2504         struct net *net = (struct net *)m->private;
2505         fib6_clean_all(net, rt6_info_route, 0, m);
2506         return 0;
2507 }
2508
/*
 * open() handler for the route listing: sets up a single-shot seq_file
 * through single_open_net() with ipv6_route_show() as its show routine.
 */
2509 static int ipv6_route_open(struct inode *inode, struct file *file)
2510 {
2511         return single_open_net(inode, file, ipv6_route_show);
2512 }
2513
2514 static const struct file_operations ipv6_route_proc_fops = {
2515         .owner          = THIS_MODULE,
2516         .open           = ipv6_route_open,
2517         .read           = seq_read,
2518         .llseek         = seq_lseek,
2519         .release        = single_release_net,
2520 };
2521
2522 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2523 {
2524         struct net *net = (struct net *)seq->private;
2525         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2526                    net->ipv6.rt6_stats->fib_nodes,
2527                    net->ipv6.rt6_stats->fib_route_nodes,
2528                    net->ipv6.rt6_stats->fib_rt_alloc,
2529                    net->ipv6.rt6_stats->fib_rt_entries,
2530                    net->ipv6.rt6_stats->fib_rt_cache,
2531                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2532                    net->ipv6.rt6_stats->fib_discarded_routes);
2533
2534         return 0;
2535 }
2536
/*
 * open() handler for the statistics file: sets up a single-shot seq_file
 * through single_open_net() with rt6_stats_seq_show() as its show routine.
 */
2537 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2538 {
2539         return single_open_net(inode, file, rt6_stats_seq_show);
2540 }
2541
2542 static const struct file_operations rt6_stats_seq_fops = {
2543         .owner   = THIS_MODULE,
2544         .open    = rt6_stats_seq_open,
2545         .read    = seq_read,
2546         .llseek  = seq_lseek,
2547         .release = single_release_net,
2548 };
2549 #endif  /* CONFIG_PROC_FS */
2550
2551 #ifdef CONFIG_SYSCTL
2552
2553 static
2554 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2555                               void __user *buffer, size_t *lenp, loff_t *ppos)
2556 {
2557         struct net *net = current->nsproxy->net_ns;
2558         int delay = net->ipv6.sysctl.flush_delay;
2559         if (write) {
2560                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2561                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2562                 return 0;
2563         } else
2564                 return -EINVAL;
2565 }
2566
2567 ctl_table ipv6_route_table_template[] = {
2568         {
2569                 .procname       =       "flush",
2570                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2571                 .maxlen         =       sizeof(int),
2572                 .mode           =       0200,
2573                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2574         },
2575         {
2576                 .procname       =       "gc_thresh",
2577                 .data           =       &ip6_dst_ops_template.gc_thresh,
2578                 .maxlen         =       sizeof(int),
2579                 .mode           =       0644,
2580                 .proc_handler   =       proc_dointvec,
2581         },
2582         {
2583                 .procname       =       "max_size",
2584                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2585                 .maxlen         =       sizeof(int),
2586                 .mode           =       0644,
2587                 .proc_handler   =       proc_dointvec,
2588         },
2589         {
2590                 .procname       =       "gc_min_interval",
2591                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2592                 .maxlen         =       sizeof(int),
2593                 .mode           =       0644,
2594                 .proc_handler   =       proc_dointvec_jiffies,
2595         },
2596         {
2597                 .procname       =       "gc_timeout",
2598                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2599                 .maxlen         =       sizeof(int),
2600                 .mode           =       0644,
2601                 .proc_handler   =       proc_dointvec_jiffies,
2602         },
2603         {
2604                 .procname       =       "gc_interval",
2605                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2606                 .maxlen         =       sizeof(int),
2607                 .mode           =       0644,
2608                 .proc_handler   =       proc_dointvec_jiffies,
2609         },
2610         {
2611                 .procname       =       "gc_elasticity",
2612                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2613                 .maxlen         =       sizeof(int),
2614                 .mode           =       0644,
2615                 .proc_handler   =       proc_dointvec,
2616         },
2617         {
2618                 .procname       =       "mtu_expires",
2619                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2620                 .maxlen         =       sizeof(int),
2621                 .mode           =       0644,
2622                 .proc_handler   =       proc_dointvec_jiffies,
2623         },
2624         {
2625                 .procname       =       "min_adv_mss",
2626                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2627                 .maxlen         =       sizeof(int),
2628                 .mode           =       0644,
2629                 .proc_handler   =       proc_dointvec,
2630         },
2631         {
2632                 .procname       =       "gc_min_interval_ms",
2633                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2634                 .maxlen         =       sizeof(int),
2635                 .mode           =       0644,
2636                 .proc_handler   =       proc_dointvec_ms_jiffies,
2637         },
2638         { }
2639 };
2640
2641 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2642 {
2643         struct ctl_table *table;
2644
2645         table = kmemdup(ipv6_route_table_template,
2646                         sizeof(ipv6_route_table_template),
2647                         GFP_KERNEL);
2648
2649         if (table) {
2650                 table[0].data = &net->ipv6.sysctl.flush_delay;
2651                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2652                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2653                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2654                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2655                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2656                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2657                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2658                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2659                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2660         }
2661
2662         return table;
2663 }
2664 #endif
2665
2666 static int __net_init ip6_route_net_init(struct net *net)
2667 {
2668         int ret = -ENOMEM;
2669
2670         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2671                sizeof(net->ipv6.ip6_dst_ops));
2672
2673         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2674                 goto out_ip6_dst_ops;
2675
2676         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2677                                            sizeof(*net->ipv6.ip6_null_entry),
2678                                            GFP_KERNEL);
2679         if (!net->ipv6.ip6_null_entry)
2680                 goto out_ip6_dst_entries;
2681         net->ipv6.ip6_null_entry->dst.path =
2682                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2683         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2684         dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2685
2686 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2687         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2688                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2689                                                GFP_KERNEL);
2690         if (!net->ipv6.ip6_prohibit_entry)
2691                 goto out_ip6_null_entry;
2692         net->ipv6.ip6_prohibit_entry->dst.path =
2693                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2694         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2695         dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2696
2697         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2698                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2699                                                GFP_KERNEL);
2700         if (!net->ipv6.ip6_blk_hole_entry)
2701                 goto out_ip6_prohibit_entry;
2702         net->ipv6.ip6_blk_hole_entry->dst.path =
2703                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2704         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2705         dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2706 #endif
2707
2708         net->ipv6.sysctl.flush_delay = 0;
2709         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2710         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2711         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2712         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2713         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2714         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2715         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2716
2717 #ifdef CONFIG_PROC_FS
2718         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2719         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2720 #endif
2721         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2722
2723         ret = 0;
2724 out:
2725         return ret;
2726
2727 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2728 out_ip6_prohibit_entry:
2729         kfree(net->ipv6.ip6_prohibit_entry);
2730 out_ip6_null_entry:
2731         kfree(net->ipv6.ip6_null_entry);
2732 #endif
2733 out_ip6_dst_entries:
2734         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2735 out_ip6_dst_ops:
2736         goto out;
2737 }
2738
2739 static void __net_exit ip6_route_net_exit(struct net *net)
2740 {
2741 #ifdef CONFIG_PROC_FS
2742         proc_net_remove(net, "ipv6_route");
2743         proc_net_remove(net, "rt6_stats");
2744 #endif
2745         kfree(net->ipv6.ip6_null_entry);
2746 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2747         kfree(net->ipv6.ip6_prohibit_entry);
2748         kfree(net->ipv6.ip6_blk_hole_entry);
2749 #endif
2750         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2751 }
2752
2753 static struct pernet_operations ip6_route_net_ops = {
2754         .init = ip6_route_net_init,
2755         .exit = ip6_route_net_exit,
2756 };
2757
2758 static struct notifier_block ip6_route_dev_notifier = {
2759         .notifier_call = ip6_route_dev_notify,
2760         .priority = 0,
2761 };
2762
2763 int __init ip6_route_init(void)
2764 {
2765         int ret;
2766
2767         ret = -ENOMEM;
2768         ip6_dst_ops_template.kmem_cachep =
2769                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2770                                   SLAB_HWCACHE_ALIGN, NULL);
2771         if (!ip6_dst_ops_template.kmem_cachep)
2772                 goto out;
2773
2774         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2775         if (ret)
2776                 goto out_kmem_cache;
2777
2778         ret = register_pernet_subsys(&ip6_route_net_ops);
2779         if (ret)
2780                 goto out_dst_entries;
2781
2782         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2783
2784         /* Registering of the loopback is done before this portion of code,
2785          * the loopback reference in rt6_info will not be taken, do it
2786          * manually for init_net */
2787         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2788         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2789   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2790         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2791         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2792         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2793         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2794   #endif
2795         ret = fib6_init();
2796         if (ret)
2797                 goto out_register_subsys;
2798
2799         ret = xfrm6_init();
2800         if (ret)
2801                 goto out_fib6_init;
2802
2803         ret = fib6_rules_init();
2804         if (ret)
2805                 goto xfrm6_init;
2806
2807         ret = -ENOBUFS;
2808         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2809             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2810             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2811                 goto fib6_rules_init;
2812
2813         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2814         if (ret)
2815                 goto fib6_rules_init;
2816
2817 out:
2818         return ret;
2819
2820 fib6_rules_init:
2821         fib6_rules_cleanup();
2822 xfrm6_init:
2823         xfrm6_fini();
2824 out_fib6_init:
2825         fib6_gc_cleanup();
2826 out_register_subsys:
2827         unregister_pernet_subsys(&ip6_route_net_ops);
2828 out_dst_entries:
2829         dst_entries_destroy(&ip6_dst_blackhole_ops);
2830 out_kmem_cache:
2831         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2832         goto out;
2833 }
2834
2835 void ip6_route_cleanup(void)
2836 {
2837         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2838         fib6_rules_cleanup();
2839         xfrm6_fini();
2840         fib6_gc_cleanup();
2841         unregister_pernet_subsys(&ip6_route_net_ops);
2842         dst_entries_destroy(&ip6_dst_blackhole_ops);
2843         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2844 }