]> git.karo-electronics.de Git - linux-beck.git/blob - net/ipv4/route.c
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec
[linux-beck.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Ceiling applied to any MTU carried in route metrics. */
#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

/* Routing tunables; presumably exported through sysctl -- the sysctl
 * table itself is not visible in this chunk (TODO confirm).
 */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
132
133 /*
134  *      Interface to generic destination cache.
135  */
136
/* Forward declarations of the dst_ops callbacks wired into
 * ipv4_dst_ops below; implementations appear later in this file.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);
147
/* ->ifdown callback: intentionally a no-op for IPv4 routes. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}
152
/* ->cow_metrics callback.  The WARN_ON indicates this is never expected
 * to be reached for IPv4 dsts; refuse the copy by returning NULL.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}
158
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160                                            struct sk_buff *skb,
161                                            const void *daddr);
162
/* Hook table connecting the generic dst layer to the IPv4 routing
 * implementations in this file.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};
179
/* An entry whose ECN bit is set maps to the same band as its base class. */
#define ECN_OR_COST(class)      TC_PRIO_##class

/* Map TOS combinations to traffic-control priority bands.
 * NOTE(review): how the TOS byte is reduced to a 0..15 index happens at
 * the call sites, which are not visible in this chunk -- confirm there.
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
201
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208         if (*pos)
209                 return NULL;
210         return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215         ++*pos;
216         return NULL;
217 }
218
/* No iteration state to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
222
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225         if (v == SEQ_START_TOKEN)
226                 seq_printf(seq, "%-127s\n",
227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229                            "HHUptod\tSpecDst");
230         return 0;
231 }
232
/* seq_file plumbing for the "rt_cache" proc entry registered in
 * ip_rt_do_proc_init() (header line only).
 */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
252
253
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256         int cpu;
257
258         if (*pos == 0)
259                 return SEQ_START_TOKEN;
260
261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262                 if (!cpu_possible(cpu))
263                         continue;
264                 *pos = cpu+1;
265                 return &per_cpu(rt_cache_stat, cpu);
266         }
267         return NULL;
268 }
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272         int cpu;
273
274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275                 if (!cpu_possible(cpu))
276                         continue;
277                 *pos = cpu+1;
278                 return &per_cpu(rt_cache_stat, cpu);
279         }
280         return NULL;
281
282 }
283
/* No iteration state to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
288
/* Emit the header line or one CPU's routing statistics.  Only the
 * "entries" column is global (total dst count); the rest are per-cpu
 * counters from rt_cache_stat.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}
322
/* seq_file plumbing for the per-cpu statistics "rt_cache" entry created
 * under net->proc_net_stat in ip_rt_do_proc_init().
 */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347         struct ip_rt_acct *dst, *src;
348         unsigned int i, j;
349
350         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351         if (!dst)
352                 return -ENOMEM;
353
354         for_each_possible_cpu(i) {
355                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356                 for (j = 0; j < 256; j++) {
357                         dst[j].o_bytes   += src[j].o_bytes;
358                         dst[j].o_packets += src[j].o_packets;
359                         dst[j].i_bytes   += src[j].i_bytes;
360                         dst[j].i_packets += src[j].i_packets;
361                 }
362         }
363
364         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365         kfree(dst);
366         return 0;
367 }
368
/* single_open plumbing for the "rt_acct" proc entry (one-shot dump). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
381 #endif
382
/* Register this namespace's proc entries: "rt_cache" under proc_net
 * (legacy header-only file), "rt_cache" under proc_net_stat (per-cpu
 * counters) and, with CONFIG_IP_ROUTE_CLASSID, "rt_acct".  On failure
 * the entries already registered are removed in reverse order.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
                        &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}
413
/* Tear down everything ip_rt_do_proc_init() registered for @net. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}
422
/* Hook proc registration into network-namespace creation/destruction. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}
432
433 #else
/* CONFIG_PROC_FS disabled: no proc files to register. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
438 #endif /* CONFIG_PROC_FS */
439
/* A cached route is stale once its generation id no longer matches the
 * namespace-wide routing generation counter.
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
444
/* Invalidate every cached route in @net by bumping the generation id;
 * rt_is_expired() then rejects all pre-existing entries.
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}
449
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451                                            struct sk_buff *skb,
452                                            const void *daddr)
453 {
454         struct net_device *dev = dst->dev;
455         const __be32 *pkey = daddr;
456         const struct rtable *rt;
457         struct neighbour *n;
458
459         rt = (const struct rtable *) dst;
460         if (rt->rt_gateway)
461                 pkey = (const __be32 *) &rt->rt_gateway;
462         else if (skb)
463                 pkey = &ip_hdr(skb)->daddr;
464
465         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466         if (n)
467                 return n;
468         return neigh_create(&arp_tbl, pkey, dev);
469 }
470
471 /*
472  * Peer allocation may fail only in serious out-of-memory conditions.  However
473  * we still can generate some output.
474  * Random ID selection looks a bit dangerous because we have no chances to
475  * select ID being unique in a reasonable period of time.
476  * But broken packet identifier may be better than no packet at all.
477  */
/* Fallback IP ID generator used only when no inet_peer was available
 * (see __ip_select_ident).  NOTE(review): a single global counter mixed
 * with the destination via secure_ip_id() gives IDs that are correlated
 * across destinations; the comment above accepts this as a last resort.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        /* Serialise updates of the shared fallback counter. */
        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}
490
/* Fill in the IPv4 identification field of @iph.  Prefers the
 * per-destination counter kept on the inet_peer (created on demand);
 * @more is passed through to inet_getid() -- presumably the number of
 * additional IDs to reserve, verify against that helper.  Falls back to
 * the global generator when peer allocation fails.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
506
/* Initialise @fl4 for a route lookup.  When @sk is given, its binding
 * (device, mark, connection TOS flags, protocol) overrides the
 * caller-supplied per-packet values; the addresses always come from
 * @iph.
 */
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                /* Raw sockets with IP_HDRINCL key on IPPROTO_RAW. */
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}
525
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527                                const struct sock *sk)
528 {
529         const struct iphdr *iph = ip_hdr(skb);
530         int oif = skb->dev->ifindex;
531         u8 tos = RT_TOS(iph->tos);
532         u8 prot = iph->protocol;
533         u32 mark = skb->mark;
534
535         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537
/* Build a flow key from a connected socket's addressing state alone (no
 * packet available).  With source routing enabled the first hop (faddr)
 * replaces the final destination.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        /* inet_opt is RCU-managed; keep it pinned while reading faddr. */
        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}
555
/* With a packet in hand, key on its headers; otherwise fall back to the
 * socket's addressing state.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb)
                build_sk_flow_key(fl4, sk);
        else
                build_skb_flow_key(fl4, skb, sk);
}
564
/* Free a routing cache entry after the current RCU grace period. */
static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}
569
570 static DEFINE_SPINLOCK(fnhe_lock);
571
/* Pick the exception with the oldest stamp in @hash for recycling and
 * drop any cached route it still references.  Called under fnhe_lock
 * (see update_or_create_fnhe), which keeps the chain stable here.
 * NOTE(review): assumes the chain is non-empty -- the only caller
 * invokes this after finding depth > FNHE_RECLAIM_DEPTH.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        /* Detach and free the victim's cached route before reuse. */
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}
590
591 static inline u32 fnhe_hashfun(__be32 daddr)
592 {
593         u32 hval;
594
595         hval = (__force u32) daddr;
596         hval ^= (hval >> 11) ^ (hval >> 22);
597
598         return hval & (FNHE_HASH_SIZE - 1);
599 }
600
/* Record or refresh a per-nexthop exception for @daddr: a redirect
 * gateway (@gw) and/or a learned path MTU (@pmtu, valid until
 * @expires).  The hash table hanging off the fib_nh is allocated
 * lazily; all mutation happens under the global fnhe_lock.
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        /* First exception on this nexthop: allocate the bucket array. */
        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        /* Look for an existing entry for this destination, counting the
         * chain length as we go.
         */
        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                /* Update in place; only the fields the caller actually
                 * supplied are overwritten.
                 */
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = expires;
                }
        } else {
                /* Chain too long: recycle the stalest entry rather than
                 * letting the bucket grow without bound.
                 */
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}
659
/* Process an ICMP redirect for @rt: validate the advertised gateway
 * and, when acceptable, record it as a per-destination exception on the
 * route's nexthop.  With @kill_route set, the cached route is marked
 * obsolete so the next lookup switches to the new path.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        /* Only the four defined redirect codes are honoured. */
        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        /* A redirect is only valid from the gateway we currently use. */
        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        /* Reject bogus gateways: unchanged, redirects disabled on the
         * device, or a multicast/limited-broadcast/zero address.
         */
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                /* Non-shared media: the new gateway must be on-link and,
                 * with secure redirects, must be a known default gateway.
                 */
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        /* Gateway not resolved yet: kick off neighbour
                         * resolution instead of accepting the redirect.
                         */
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}
739
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741 {
742         struct rtable *rt;
743         struct flowi4 fl4;
744
745         rt = (struct rtable *) dst;
746
747         ip_rt_build_flow_key(&fl4, sk, skb);
748         __ip_do_redirect(rt, skb, &fl4, true);
749 }
750
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752 {
753         struct rtable *rt = (struct rtable *)dst;
754         struct dst_entry *ret = dst;
755
756         if (rt) {
757                 if (dst->obsolete > 0) {
758                         ip_rt_put(rt);
759                         ret = NULL;
760                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761                            rt->dst.expires) {
762                         ip_rt_put(rt);
763                         ret = NULL;
764                 }
765         }
766         return ret;
767 }
768
769 /*
770  * Algorithm:
771  *      1. The first ip_rt_redirect_number redirects are sent
772  *         with exponential backoff, then we stop sending them at all,
773  *         assuming that the host ignores our redirects.
774  *      2. If we did not see packets requiring redirects
775  *         during ip_rt_redirect_silence, we assume that the host
776  *         forgot redirected route and start to send redirects again.
777  *
778  * This algorithm is much cheaper and more intelligent than dumb load limiting
779  * in icmp.c.
780  *
781  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782  * and "frag. need" (breaks PMTU discovery) in icmp.c.
783  */
784
/* Send an ICMP redirect back to the sender of @skb, rate-limited per
 * source host via its inet_peer entry using exponential backoff (see
 * the algorithm description above).
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                /* NOTE(review): without a peer entry this path sends a
                 * redirect with no rate limiting at all -- confirm this
                 * is intended for the allocation-failure case.
                 */
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything
         * set dst.rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                /* Warn once, exactly when the backoff budget runs out. */
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}
847
848 static int ip_error(struct sk_buff *skb)
849 {
850         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851         struct rtable *rt = skb_rtable(skb);
852         struct inet_peer *peer;
853         unsigned long now;
854         struct net *net;
855         bool send;
856         int code;
857
858         net = dev_net(rt->dst.dev);
859         if (!IN_DEV_FORWARD(in_dev)) {
860                 switch (rt->dst.error) {
861                 case EHOSTUNREACH:
862                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863                         break;
864
865                 case ENETUNREACH:
866                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867                         break;
868                 }
869                 goto out;
870         }
871
872         switch (rt->dst.error) {
873         case EINVAL:
874         default:
875                 goto out;
876         case EHOSTUNREACH:
877                 code = ICMP_HOST_UNREACH;
878                 break;
879         case ENETUNREACH:
880                 code = ICMP_NET_UNREACH;
881                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
882                 break;
883         case EACCES:
884                 code = ICMP_PKT_FILTERED;
885                 break;
886         }
887
888         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
889
890         send = true;
891         if (peer) {
892                 now = jiffies;
893                 peer->rate_tokens += now - peer->rate_last;
894                 if (peer->rate_tokens > ip_rt_error_burst)
895                         peer->rate_tokens = ip_rt_error_burst;
896                 peer->rate_last = now;
897                 if (peer->rate_tokens >= ip_rt_error_cost)
898                         peer->rate_tokens -= ip_rt_error_cost;
899                 else
900                         send = false;
901                 inet_putpeer(peer);
902         }
903         if (send)
904                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
905
906 out:    kfree_skb(skb);
907         return 0;
908 }
909
/* Apply a newly learned path MTU @mtu to @rt (originally looked up
 * via @fl4).
 *
 * No-op when the route's MTU metric is locked or the reported value
 * is larger than the device MTU.  The value is clamped up to
 * ip_rt_min_pmtu, stored on the route with an expiry, and recorded as
 * a nexthop exception so subsequent lookups inherit it.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (dst->dev->mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (!rt->rt_pmtu) {
		/* No PMTU cached on this route yet: kill the dst so the
		 * next dst_check() fails and the lookup path picks up
		 * the fnhe created below.
		 */
		dst->obsolete = DST_OBSOLETE_KILL;
	} else {
		rt->rt_pmtu = mtu;
		/* max() keeps expires non-zero; a zero expires would
		 * read as "never expires".
		 */
		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
	}

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
940
941 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
942                               struct sk_buff *skb, u32 mtu)
943 {
944         struct rtable *rt = (struct rtable *) dst;
945         struct flowi4 fl4;
946
947         ip_rt_build_flow_key(&fl4, sk, skb);
948         __ip_rt_update_pmtu(rt, &fl4, mtu);
949 }
950
951 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
952                       int oif, u32 mark, u8 protocol, int flow_flags)
953 {
954         const struct iphdr *iph = (const struct iphdr *) skb->data;
955         struct flowi4 fl4;
956         struct rtable *rt;
957
958         __build_flow_key(&fl4, NULL, iph, oif,
959                          RT_TOS(iph->tos), protocol, mark, flow_flags);
960         rt = __ip_route_output_key(net, &fl4);
961         if (!IS_ERR(rt)) {
962                 __ip_rt_update_pmtu(rt, &fl4, mtu);
963                 ip_rt_put(rt);
964         }
965 }
966 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
967
968 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
969 {
970         const struct iphdr *iph = (const struct iphdr *) skb->data;
971         struct flowi4 fl4;
972         struct rtable *rt;
973
974         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
975         rt = __ip_route_output_key(sock_net(sk), &fl4);
976         if (!IS_ERR(rt)) {
977                 __ip_rt_update_pmtu(rt, &fl4, mtu);
978                 ip_rt_put(rt);
979         }
980 }
981
/* Update the PMTU for a socket's cached route.
 *
 * If the socket is locked by user context or has no cached route,
 * fall back to a plain flow lookup (__ipv4_sk_update_pmtu).
 * Otherwise update the cached route directly, re-resolving it when
 * validation fails, and re-install the checked dst on the socket.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *dst;

	bh_lock_sock(sk);
	rt = (struct rtable *) __sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !rt) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	/* Cached dst failed validation: resolve a fresh route. */
	if (!__sk_dst_check(sk, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;
	}

	/* Update the terminal route (dst.path skips any xfrm layers). */
	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	dst = dst_check(&rt->dst, 0);
	if (!dst) {
		/* The PMTU update invalidated the dst; look it up again.
		 * NOTE(review): if the lookup above already produced a
		 * fresh rt, its reference appears to be dropped on the
		 * floor here — TODO confirm against upstream fixes.
		 */
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		dst = &rt->dst;
	}

	 __sk_dst_set(sk, dst);

out:
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1022
1023 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1024                    int oif, u32 mark, u8 protocol, int flow_flags)
1025 {
1026         const struct iphdr *iph = (const struct iphdr *) skb->data;
1027         struct flowi4 fl4;
1028         struct rtable *rt;
1029
1030         __build_flow_key(&fl4, NULL, iph, oif,
1031                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1032         rt = __ip_route_output_key(net, &fl4);
1033         if (!IS_ERR(rt)) {
1034                 __ip_do_redirect(rt, skb, &fl4, false);
1035                 ip_rt_put(rt);
1036         }
1037 }
1038 EXPORT_SYMBOL_GPL(ipv4_redirect);
1039
1040 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1041 {
1042         const struct iphdr *iph = (const struct iphdr *) skb->data;
1043         struct flowi4 fl4;
1044         struct rtable *rt;
1045
1046         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1047         rt = __ip_route_output_key(sock_net(sk), &fl4);
1048         if (!IS_ERR(rt)) {
1049                 __ip_do_redirect(rt, skb, &fl4, false);
1050                 ip_rt_put(rt);
1051         }
1052 }
1053 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1054
1055 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1056 {
1057         struct rtable *rt = (struct rtable *) dst;
1058
1059         /* All IPV4 dsts are created with ->obsolete set to the value
1060          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1061          * into this function always.
1062          *
1063          * When a PMTU/redirect information update invalidates a
1064          * route, this is indicated by setting obsolete to
1065          * DST_OBSOLETE_KILL.
1066          */
1067         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1068                 return NULL;
1069         return dst;
1070 }
1071
1072 static void ipv4_link_failure(struct sk_buff *skb)
1073 {
1074         struct rtable *rt;
1075
1076         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1077
1078         rt = skb_rtable(skb);
1079         if (rt)
1080                 dst_set_expires(&rt->dst, 0);
1081 }
1082
/* Output handler installed on routes that must never transmit (e.g.
 * local-delivery routes).  Logs the flow, drops the skb and warns.
 */
static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
1092
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1101
/* Write the source address to advertise for @rt into @addr, which may
 * be unaligned (it points into IP options) — hence the final memcpy.
 *
 * For output routes the packet's own source is used; for input routes
 * the flow is re-resolved via the FIB to obtain the preferred source
 * address, falling back to an address on the output device.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Rebuild the flow key from the packet headers. */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1134
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill in each 16-bit half of the route's classid from @tag, but only
 * where the route does not already carry a value.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 tclassid = rt->dst.tclassid;

	if (!(tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
	if (!(tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
}
#endif
1144
1145 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1146 {
1147         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1148
1149         if (advmss == 0) {
1150                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1151                                ip_rt_min_advmss);
1152                 if (advmss > 65535 - 40)
1153                         advmss = 65535 - 40;
1154         }
1155         return advmss;
1156 }
1157
/* dst_ops->mtu hook: compute the effective MTU for @dst.
 *
 * Precedence: a still-valid learned PMTU, then the RTAX_MTU metric,
 * then the device MTU (limited to 576 for gatewayed routes whose MTU
 * metric is locked).  The result never exceeds IP_MAX_MTU.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	/* Ignore a learned PMTU once it has expired. */
	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
1181
1182 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1183 {
1184         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1185         struct fib_nh_exception *fnhe;
1186         u32 hval;
1187
1188         if (!hash)
1189                 return NULL;
1190
1191         hval = fnhe_hashfun(daddr);
1192
1193         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1194              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1195                 if (fnhe->fnhe_daddr == daddr)
1196                         return fnhe;
1197         }
1198         return NULL;
1199 }
1200
/* Bind @rt to nexthop exception @fnhe for @daddr: import the
 * exception's learned PMTU and/or redirect gateway into the route and
 * install the route as the exception's cached dst, releasing any
 * previous one.  Returns true if the route was bound.  All fnhe state
 * changes are serialized by fnhe_lock.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
		/* A cached route from a stale generation means the
		 * learned data is stale as well: forget it.
		 */
		if (orig && rt_is_expired(orig)) {
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
		}
		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			/* Only adopt the PMTU while it is unexpired. */
			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		if (fnhe->fnhe_gw) {
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
			rt->rt_uses_gateway = 1;
		} else if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1242
/* Try to store @rt as the cached route on nexthop @nh: the per-nexthop
 * input slot for input routes, the per-CPU output slot otherwise.
 * A single cmpxchg is attempted; if a concurrent update wins, false is
 * returned and the caller must treat the route as uncached.
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		/* We won: release the route we displaced. */
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}
1264
/* Routes that could not be cached on a FIB nexthop (DST_NOCACHE) are
 * tracked on this global list so rt_flush_dev() can still find them
 * when a device goes away.
 */
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

/* Put @rt on the uncached list; it is unlinked in ipv4_dst_destroy(). */
static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}
1274
/* dst_ops->destroy hook: unlink the route from the uncached list if
 * rt_add_uncached_list() placed it there.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1285
/* Device teardown helper: re-point any uncached route still holding
 * @dev at the namespace loopback device (moving the device reference
 * accordingly) so @dev's refcount can drop to zero.
 */
void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1303
1304 static bool rt_cache_valid(const struct rtable *rt)
1305 {
1306         return  rt &&
1307                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1308                 !rt_is_expired(rt);
1309 }
1310
/* Finalize @rt from FIB lookup state @res: gateway, metrics, classid
 * and caching.  The route is cached in the nexthop exception @fnhe
 * (when given) or in the per-nexthop cache; if neither succeeds it is
 * flagged DST_NOCACHE and tracked on the uncached list.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Only a link-scope nexthop gateway is a real gateway. */
		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1354
1355 static struct rtable *rt_dst_alloc(struct net_device *dev,
1356                                    bool nopolicy, bool noxfrm, bool will_cache)
1357 {
1358         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1359                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1360                          (nopolicy ? DST_NOPOLICY : 0) |
1361                          (noxfrm ? DST_NOXFRM : 0));
1362 }
1363
/* called in rcu_read_lock() section */
/* Build an input route for a multicast packet received on @dev.
 * @our: non-zero when this host is a member of the destination group,
 * in which case the packet is delivered locally (possibly in addition
 * to multicast forwarding when CONFIG_IP_MROUTE is enabled).
 * Returns 0 on success, a negative errno otherwise.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* A zero source is only acceptable for local multicast. */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* This route must never transmit; ip_rt_bug warns if it does. */
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1435
1436
/* Account a packet with a martian source address and, when martian
 * logging is enabled, log it with a hex dump of the link-layer header.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
1461
/* called in rcu_read_lock() section */
/* Build and attach a forwarding route for @skb from FIB result @res.
 * Validates the source address, decides whether to flag an ICMP
 * redirect (RTCF_DOREDIRECT) and whether the route can be cached on
 * the FIB nexthop.  Returns 0 on success or a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	/* Packet leaves on the interface it arrived on: a candidate for
	 * an ICMP redirect, and such routes must not be cached.
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Reuse a valid cached nexthop route when possible. */
	if (do_cache) {
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1550
/* Pick a single nexthop for @res (multipath selection when compiled
 * in) and build the forwarding route for @skb.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1565
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the output
 *	routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */
1576
/* Slow-path input route resolution for a packet received on @dev with
 * the given daddr/saddr/tos.  Performs martian filtering, a FIB
 * lookup, builds (or reuses) the appropriate route and attaches it to
 * @skb.  Returns 0 on success or a negative errno.  Called with
 * rcu_read_lock() held.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and call it once if daddr or/and saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Unicast forwarding path. */
	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Local delivery (also reached for broadcast and for the
	 * RTN_UNREACHABLE route built under no_route below).  Try the
	 * cached per-nexthop input route first.
	 */
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	/* Local routes must never transmit; ip_rt_bug warns if they do. */
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
1753
1754 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1755                          u8 tos, struct net_device *dev)
1756 {
1757         int res;
1758
1759         rcu_read_lock();
1760
1761         /* Multicast recognition logic is moved from route cache to here.
1762            The problem was that too many Ethernet cards have broken/missing
1763            hardware multicast filters :-( As result the host on multicasting
1764            network acquires a lot of useless route cache entries, sort of
1765            SDR messages from all the world. Now we try to get rid of them.
1766            Really, provided software IP multicast filter is organized
1767            reasonably (at least, hashed), it does not result in a slowdown
1768            comparing with route cache reject entries.
1769            Note, that multicast routers are not affected, because
1770            route cache entry is created eventually.
1771          */
1772         if (ipv4_is_multicast(daddr)) {
1773                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1774
1775                 if (in_dev) {
1776                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1777                                                   ip_hdr(skb)->protocol);
1778                         if (our
1779 #ifdef CONFIG_IP_MROUTE
1780                                 ||
1781                             (!ipv4_is_local_multicast(daddr) &&
1782                              IN_DEV_MFORWARD(in_dev))
1783 #endif
1784                            ) {
1785                                 int res = ip_route_input_mc(skb, daddr, saddr,
1786                                                             tos, dev, our);
1787                                 rcu_read_unlock();
1788                                 return res;
1789                         }
1790                 }
1791                 rcu_read_unlock();
1792                 return -EINVAL;
1793         }
1794         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1795         rcu_read_unlock();
1796         return res;
1797 }
1798 EXPORT_SYMBOL(ip_route_input_noref);
1799
/* called with rcu_read_lock() */
/* Build (or fetch from cache) the output rtable for a resolved FIB
 * lookup @res and flow @fl4, to be sent out via @dev_out.
 * Returns the route (with a reference held when taken from the cache)
 * or an ERR_PTR on failure.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* A loopback source address is only acceptable when sending on a
	 * loopback device, unless route_localnet is enabled on @dev_out.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	/* Classify the destination; zeronet destinations are invalid. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Only deliver locally if we are a member of the group. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Prefer a per-destination exception entry (PMTU/redirect
		 * state) over the shared per-cpu cached nexthop route.
		 */
		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else {
			/* Caller supplies its own nexthop (KNOWN_NH) and
			 * the FIB nexthop is not an on-link gateway: the
			 * shared cache entry would be wrong - build a
			 * fresh, uncached route instead.
			 */
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			/* Local delivery plus an on-the-wire copy. */
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}
1921
1922 /*
1923  * Major route resolver routine.
1924  */
1925
1926 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1927 {
1928         struct net_device *dev_out = NULL;
1929         __u8 tos = RT_FL_TOS(fl4);
1930         unsigned int flags = 0;
1931         struct fib_result res;
1932         struct rtable *rth;
1933         int orig_oif;
1934
1935         res.tclassid    = 0;
1936         res.fi          = NULL;
1937         res.table       = NULL;
1938
1939         orig_oif = fl4->flowi4_oif;
1940
1941         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1942         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1943         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1944                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1945
1946         rcu_read_lock();
1947         if (fl4->saddr) {
1948                 rth = ERR_PTR(-EINVAL);
1949                 if (ipv4_is_multicast(fl4->saddr) ||
1950                     ipv4_is_lbcast(fl4->saddr) ||
1951                     ipv4_is_zeronet(fl4->saddr))
1952                         goto out;
1953
1954                 /* I removed check for oif == dev_out->oif here.
1955                    It was wrong for two reasons:
1956                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1957                       is assigned to multiple interfaces.
1958                    2. Moreover, we are allowed to send packets with saddr
1959                       of another iface. --ANK
1960                  */
1961
1962                 if (fl4->flowi4_oif == 0 &&
1963                     (ipv4_is_multicast(fl4->daddr) ||
1964                      ipv4_is_lbcast(fl4->daddr))) {
1965                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1966                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1967                         if (dev_out == NULL)
1968                                 goto out;
1969
1970                         /* Special hack: user can direct multicasts
1971                            and limited broadcast via necessary interface
1972                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1973                            This hack is not just for fun, it allows
1974                            vic,vat and friends to work.
1975                            They bind socket to loopback, set ttl to zero
1976                            and expect that it will work.
1977                            From the viewpoint of routing cache they are broken,
1978                            because we are not allowed to build multicast path
1979                            with loopback source addr (look, routing cache
1980                            cannot know, that ttl is zero, so that packet
1981                            will not leave this host and route is valid).
1982                            Luckily, this hack is good workaround.
1983                          */
1984
1985                         fl4->flowi4_oif = dev_out->ifindex;
1986                         goto make_route;
1987                 }
1988
1989                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1990                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1991                         if (!__ip_dev_find(net, fl4->saddr, false))
1992                                 goto out;
1993                 }
1994         }
1995
1996
1997         if (fl4->flowi4_oif) {
1998                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1999                 rth = ERR_PTR(-ENODEV);
2000                 if (dev_out == NULL)
2001                         goto out;
2002
2003                 /* RACE: Check return value of inet_select_addr instead. */
2004                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2005                         rth = ERR_PTR(-ENETUNREACH);
2006                         goto out;
2007                 }
2008                 if (ipv4_is_local_multicast(fl4->daddr) ||
2009                     ipv4_is_lbcast(fl4->daddr)) {
2010                         if (!fl4->saddr)
2011                                 fl4->saddr = inet_select_addr(dev_out, 0,
2012                                                               RT_SCOPE_LINK);
2013                         goto make_route;
2014                 }
2015                 if (fl4->saddr) {
2016                         if (ipv4_is_multicast(fl4->daddr))
2017                                 fl4->saddr = inet_select_addr(dev_out, 0,
2018                                                               fl4->flowi4_scope);
2019                         else if (!fl4->daddr)
2020                                 fl4->saddr = inet_select_addr(dev_out, 0,
2021                                                               RT_SCOPE_HOST);
2022                 }
2023         }
2024
2025         if (!fl4->daddr) {
2026                 fl4->daddr = fl4->saddr;
2027                 if (!fl4->daddr)
2028                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2029                 dev_out = net->loopback_dev;
2030                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2031                 res.type = RTN_LOCAL;
2032                 flags |= RTCF_LOCAL;
2033                 goto make_route;
2034         }
2035
2036         if (fib_lookup(net, fl4, &res)) {
2037                 res.fi = NULL;
2038                 res.table = NULL;
2039                 if (fl4->flowi4_oif) {
2040                         /* Apparently, routing tables are wrong. Assume,
2041                            that the destination is on link.
2042
2043                            WHY? DW.
2044                            Because we are allowed to send to iface
2045                            even if it has NO routes and NO assigned
2046                            addresses. When oif is specified, routing
2047                            tables are looked up with only one purpose:
2048                            to catch if destination is gatewayed, rather than
2049                            direct. Moreover, if MSG_DONTROUTE is set,
2050                            we send packet, ignoring both routing tables
2051                            and ifaddr state. --ANK
2052
2053
2054                            We could make it even if oif is unknown,
2055                            likely IPv6, but we do not.
2056                          */
2057
2058                         if (fl4->saddr == 0)
2059                                 fl4->saddr = inet_select_addr(dev_out, 0,
2060                                                               RT_SCOPE_LINK);
2061                         res.type = RTN_UNICAST;
2062                         goto make_route;
2063                 }
2064                 rth = ERR_PTR(-ENETUNREACH);
2065                 goto out;
2066         }
2067
2068         if (res.type == RTN_LOCAL) {
2069                 if (!fl4->saddr) {
2070                         if (res.fi->fib_prefsrc)
2071                                 fl4->saddr = res.fi->fib_prefsrc;
2072                         else
2073                                 fl4->saddr = fl4->daddr;
2074                 }
2075                 dev_out = net->loopback_dev;
2076                 fl4->flowi4_oif = dev_out->ifindex;
2077                 flags |= RTCF_LOCAL;
2078                 goto make_route;
2079         }
2080
2081 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2082         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2083                 fib_select_multipath(&res);
2084         else
2085 #endif
2086         if (!res.prefixlen &&
2087             res.table->tb_num_default > 1 &&
2088             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2089                 fib_select_default(&res);
2090
2091         if (!fl4->saddr)
2092                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2093
2094         dev_out = FIB_RES_DEV(res);
2095         fl4->flowi4_oif = dev_out->ifindex;
2096
2097
2098 make_route:
2099         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2100
2101 out:
2102         rcu_read_unlock();
2103         return rth;
2104 }
2105 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2106
/* Blackhole dst ->check: always report the entry as invalid so callers
 * are forced to perform a fresh route lookup.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2111
2112 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2113 {
2114         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2115
2116         return mtu ? : dst->dev->mtu;
2117 }
2118
/* Blackhole dst ->update_pmtu: intentionally a no-op. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2123
/* Blackhole dst ->redirect: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2128
/* Blackhole dst ->cow_metrics: never clone metrics for a dead route. */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2134
/* dst_ops for blackhole routes created by ipv4_blackhole_route(): every
 * mutating operation is stubbed out, ->check always fails.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2146
/* Clone @dst_orig into a "blackhole" route that silently discards all
 * traffic (input and output are dst_discard) while preserving the
 * original route's identity fields.  Consumes the caller's reference
 * on @dst_orig.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Mirror the original route's metadata so lookups on the
		 * blackhole entry still report sensible values.
		 */
		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);

		/* NOTE(review): dst_free() on the entry we are about to
		 * return looks intentional - presumably it marks the dst
		 * for destruction once the reference taken at dst_alloc()
		 * drops - but confirm against dst_alloc()/dst_free()
		 * semantics for this kernel version.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2183
2184 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2185                                     struct sock *sk)
2186 {
2187         struct rtable *rt = __ip_route_output_key(net, flp4);
2188
2189         if (IS_ERR(rt))
2190                 return rt;
2191
2192         if (flp4->flowi4_proto)
2193                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2194                                                    flowi4_to_flowi(flp4),
2195                                                    sk, 0);
2196
2197         return rt;
2198 }
2199 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2200
/* Fill a rtnetlink route message describing the route attached to @skb
 * (skb_rtable) into @skb itself.  @dst/@src are the addresses being
 * reported, @fl4 the flow used for the lookup.  Returns the result of
 * nlmsg_end() on success, 0 when ipmr resolution consumed the message,
 * or -EMSGSIZE when it does not fit.
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	/* Report the preferred source only for output routes where it
	 * differs from the already-reported RTA_SRC.
	 */
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Convert the absolute expiry time into a remaining delta
	 * (0 means no expiry or already expired).
	 */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* A still-valid learned PMTU overrides the static MTU metric. */
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		/* Forwarded multicast: let ipmr fill in the route.  A
		 * return of 0 with !nowait means the reply was handled
		 * there; <0 is either a message-size failure or an error
		 * to report via cacheinfo.
		 */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2310
/* RTM_GETROUTE handler: resolve the route described by the netlink
 * request (either an input lookup when RTA_IIF is given, or an output
 * lookup otherwise) and unicast an RTM_NEWROUTE reply to the sender.
 * Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	/* This skb doubles as a fake "received packet" for the input
	 * lookup and as the reply message buffer.
	 */
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* All attributes are optional; missing ones default to 0. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		/* Input route lookup expects softirq context. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		/* A successful lookup may still attach an error route
		 * (e.g. unreachable); surface its dst.error.
		 */
		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2407
/* Route-cache dump callback: nothing is cached any more, so report an
 * empty (complete) dump by returning the current skb length.
 */
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
	return skb->len;
}
2412
/* Multicast membership changed on @in_dev: invalidate the namespace's
 * cached routes.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
2417
2418 #ifdef CONFIG_SYSCTL
2419 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2420                                         void __user *buffer,
2421                                         size_t *lenp, loff_t *ppos)
2422 {
2423         if (write) {
2424                 rt_cache_flush((struct net *)__ctl->extra1);
2425                 return 0;
2426         }
2427
2428         return -EINVAL;
2429 }
2430
/* Global routing tunables exposed under /proc/sys/net/ipv4/route/.
 * All entries are plain ints; the *_jiffies/_ms_jiffies handlers
 * convert between user-visible seconds/milliseconds and jiffies.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
2541
2542 static struct ctl_table ipv4_route_flush_table[] = {
2543         {
2544                 .procname       = "flush",
2545                 .maxlen         = sizeof(int),
2546                 .mode           = 0200,
2547                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2548         },
2549         { },
2550 };
2551
2552 static __net_init int sysctl_route_net_init(struct net *net)
2553 {
2554         struct ctl_table *tbl;
2555
2556         tbl = ipv4_route_flush_table;
2557         if (!net_eq(net, &init_net)) {
2558                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2559                 if (tbl == NULL)
2560                         goto err_dup;
2561
2562                 /* Don't export sysctls to unprivileged users */
2563                 if (net->user_ns != &init_user_ns)
2564                         tbl[0].procname = NULL;
2565         }
2566         tbl[0].extra1 = net;
2567
2568         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2569         if (net->ipv4.route_hdr == NULL)
2570                 goto err_reg;
2571         return 0;
2572
2573 err_reg:
2574         if (tbl != ipv4_route_flush_table)
2575                 kfree(tbl);
2576 err_dup:
2577         return -ENOMEM;
2578 }
2579
2580 static __net_exit void sysctl_route_net_exit(struct net *net)
2581 {
2582         struct ctl_table *tbl;
2583
2584         tbl = net->ipv4.route_hdr->ctl_table_arg;
2585         unregister_net_sysctl_table(net->ipv4.route_hdr);
2586         BUG_ON(tbl == ipv4_route_flush_table);
2587         kfree(tbl);
2588 }
2589
/* Hook route sysctl setup/teardown into network namespace lifecycle. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
2594 #endif
2595
2596 static __net_init int rt_genid_init(struct net *net)
2597 {
2598         atomic_set(&net->rt_genid, 0);
2599         get_random_bytes(&net->ipv4.dev_addr_genid,
2600                          sizeof(net->ipv4.dev_addr_genid));
2601         return 0;
2602 }
2603
/* Initialize routing genid counters for each new network namespace. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
2607
2608 static int __net_init ipv4_inetpeer_init(struct net *net)
2609 {
2610         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2611
2612         if (!bp)
2613                 return -ENOMEM;
2614         inet_peer_base_init(bp);
2615         net->ipv4.peers = bp;
2616         return 0;
2617 }
2618
2619 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2620 {
2621         struct inet_peer_base *bp = net->ipv4.peers;
2622
2623         net->ipv4.peers = NULL;
2624         inetpeer_invalidate_tree(bp);
2625         kfree(bp);
2626 }
2627
/* Per-namespace lifecycle for the IPv4 inet_peer storage. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
2632
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route-classid accounting array; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2636
2637 int __init ip_rt_init(void)
2638 {
2639         int rc = 0;
2640
2641 #ifdef CONFIG_IP_ROUTE_CLASSID
2642         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2643         if (!ip_rt_acct)
2644                 panic("IP: failed to allocate ip_rt_acct\n");
2645 #endif
2646
2647         ipv4_dst_ops.kmem_cachep =
2648                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2649                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2650
2651         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2652
2653         if (dst_entries_init(&ipv4_dst_ops) < 0)
2654                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2655
2656         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2657                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2658
2659         ipv4_dst_ops.gc_thresh = ~0;
2660         ip_rt_max_size = INT_MAX;
2661
2662         devinet_init();
2663         ip_fib_init();
2664
2665         if (ip_rt_proc_init())
2666                 pr_err("Unable to create route proc files\n");
2667 #ifdef CONFIG_XFRM
2668         xfrm_init();
2669         xfrm4_init();
2670 #endif
2671         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2672
2673 #ifdef CONFIG_SYSCTL
2674         register_pernet_subsys(&sysctl_route_ops);
2675 #endif
2676         register_pernet_subsys(&rt_genid_ops);
2677         register_pernet_subsys(&ipv4_inetpeer_ops);
2678         return rc;
2679 }
2680
2681 #ifdef CONFIG_SYSCTL
2682 /*
2683  * We really need to sanitize the damn ipv4 init order, then all
2684  * this nonsense will go away.
2685  */
void __init ip_static_sysctl_init(void)
{
	/*
	 * Early registration of the static net/ipv4/route table for
	 * init_net.  The returned header is intentionally discarded:
	 * this table is never unregistered.
	 */
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
2690 #endif