/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

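/* The TOS bits of a flow that are relevant for route lookup: the
 * RFC 1349 TOS field masked by IPTOS_RT_MASK, plus the RTO_ONLINK
 * flag, which requests link (on-link) scope.
 */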
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* The IPv4 datagram length is stored in a 16-bit field (tot_len). */
#define IP_MAX_MTU      0xFFFF

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

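/* TOS-to-qdisc-priority map.  rt_tos2priority() (include/net/route.h)
 * indexes this table with (IPTOS_TOS(tos) >> 1), so for example
 * IPTOS_LOWDELAY (0x10) maps to TC_PRIO_INTERACTIVE and
 * IPTOS_THROUGHPUT (0x08) maps to TC_PRIO_BULK.
 */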
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
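/* The IPv4 routing cache was removed in 3.6, but /proc/net/rt_cache is
 * kept for compatibility; it now produces only the header line below.
 */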
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

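/* Per-CPU statistics, /proc/net/stat/rt_cache.  Sequence position 0 is
 * the header line; position n + 1 corresponds to possible CPU n, hence
 * the off-by-one handling of *pos in the iterators below.
 */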
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.
 * However, we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance
 * of selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

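/* Choose an IP ID for an outgoing datagram: prefer the per-destination
 * counter kept in the inet_peer cache, and fall back to the hashed
 * global generator above only when no peer entry can be allocated.
 */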
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

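/* fnhe_lock serializes all writers of the per-nexthop exception
 * tables; lookups walk the hash chains under RCU only.
 */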
static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

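/* Fold the destination address onto an exception-table bucket: the
 * shifts by 11 and 22 mix the high address bits into the low-order
 * index bits, so addresses differing only in their network part still
 * spread across buckets.
 */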
static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so that anyone caching one rechecks whether this
                 * exception applies to it.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff; then we stop sending them entirely,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
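/* Worked example, with the defaults above and HZ = 1000:
 * ip_rt_redirect_load is 20ms, so after n redirects have been sent the
 * next one is delayed by at least 20ms << n.  Once ip_rt_redirect_number
 * (9) redirects have been ignored we go silent, and resume only after
 * ip_rt_redirect_silence = 20ms << 10 (about 20.5s) without any.
 */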

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

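/* ip_error() rate-limits the ICMP errors it generates with a token
 * bucket kept in the inet_peer entry: tokens accrue at one per jiffy,
 * capped at ip_rt_error_burst, and each ICMP sent costs
 * ip_rt_error_cost tokens; with the defaults that allows a burst of
 * five errors and roughly one per second sustained.
 */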
static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

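/* Record a learned path MTU as a nexthop exception.  The new MTU is
 * clamped to at least ip_rt_min_pmtu (552 by default) and the
 * exception expires after ip_rt_mtu_expires (10 minutes by default).
 */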
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *dst;
        bool new = false;

        bh_lock_sock(sk);
        rt = (struct rtable *) __sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !rt) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!__sk_dst_check(sk, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        dst = dst_check(&rt->dst, 0);
        if (!dst) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                __sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

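/* Report the MTU for this route: a still-valid learned PMTU wins over
 * the RTAX_MTU metric, which in turn wins over the device MTU.  When
 * the metric is locked and the route uses a gateway, fall back to 576,
 * the traditional conservative value for off-link destinations.
 */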
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

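/* Cache rt in the nexthop: in the per-CPU slot for output routes, or
 * the single nh_rth_input slot for input routes.  The update is a
 * lockless cmpxchg(); if another CPU won the race, leave its entry in
 * place and report failure to the caller.
 */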
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

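/* Routes that could not be cached in a nexthop are kept on
 * rt_uncached_list so that rt_flush_dev() can re-point them at the
 * loopback device when their output device disappears.
 */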
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

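/* Allocate a dst for an IPv4 route.  Every route starts with
 * ->obsolete == DST_OBSOLETE_FORCE_CHK so that ipv4_dst_check() is
 * consulted on every use; DST_NOCACHE is set up front for routes we
 * do not intend to cache in a nexthop.
 */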
static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}

static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation: if the source is martian,
                 *      the only hint is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

1554         if (skb->protocol != htons(ETH_P_IP)) {
1555         /* Not IP (i.e. ARP). Do not create a route if it is
1556          * invalid for proxy arp. DNAT routes are always valid.
1557          *
1558          * The proxy arp feature has been extended to allow ARP
1559          * replies back out the same interface, to support
1560          * Private VLAN switch technologies. See arp.c.
1561          */
1562                 if (out_dev == in_dev &&
1563                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1564                         err = -EINVAL;
1565                         goto cleanup;
1566                 }
1567         }
1568
1569         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1570         if (do_cache) {
1571                 if (fnhe != NULL)
1572                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1573                 else
1574                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1575
1576                 if (rt_cache_valid(rth)) {
1577                         skb_dst_set_noref(skb, &rth->dst);
1578                         goto out;
1579                 }
1580         }
1581
1582         rth = rt_dst_alloc(out_dev->dev,
1583                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1584                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1585         if (!rth) {
1586                 err = -ENOBUFS;
1587                 goto cleanup;
1588         }
1589
1590         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1591         rth->rt_flags = flags;
1592         rth->rt_type = res->type;
1593         rth->rt_is_input = 1;
1594         rth->rt_iif     = 0;
1595         rth->rt_pmtu    = 0;
1596         rth->rt_gateway = 0;
1597         rth->rt_uses_gateway = 0;
1598         INIT_LIST_HEAD(&rth->rt_uncached);
1599
1600         rth->dst.input = ip_forward;
1601         rth->dst.output = ip_output;
1602
1603         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1604         skb_dst_set(skb, &rth->dst);
1605 out:
1606         err = 0;
1607  cleanup:
1608         return err;
1609 }
1610
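/* Thin wrapper around __mkroute_input() that first selects one of the
 * available next hops when the matched route is multipath.
 */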
1611 static int ip_mkroute_input(struct sk_buff *skb,
1612                             struct fib_result *res,
1613                             const struct flowi4 *fl4,
1614                             struct in_device *in_dev,
1615                             __be32 daddr, __be32 saddr, u32 tos)
1616 {
1617 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1618         if (res->fi && res->fi->fib_nhs > 1)
1619                 fib_select_multipath(res);
1620 #endif
1621
1622         /* create a routing cache entry */
1623         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1624 }
1625
1626 /*
1627  *      NOTE. We drop all packets that have a local source
1628  *      address, because every properly looped-back packet
1629  *      must already have the correct destination attached by the output routine.
1630  *
1631  *      This approach solves two big problems:
1632  *      1. Non-simplex devices are handled properly.
1633  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1634  *      called with rcu_read_lock()
1635  */
1636
1637 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1638                                u8 tos, struct net_device *dev)
1639 {
1640         struct fib_result res;
1641         struct in_device *in_dev = __in_dev_get_rcu(dev);
1642         struct flowi4   fl4;
1643         unsigned int    flags = 0;
1644         u32             itag = 0;
1645         struct rtable   *rth;
1646         int             err = -EINVAL;
1647         struct net    *net = dev_net(dev);
1648         bool do_cache;
1649
1650         /* IP on this device is disabled. */
1651
1652         if (!in_dev)
1653                 goto out;
1654
1655         /* Check for the weirdest martians, which cannot be detected
1656            by fib_lookup.
1657          */
1658
1659         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1660                 goto martian_source;
1661
1662         res.fi = NULL;
1663         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1664                 goto brd_input;
1665
1666         /* Accept zero addresses only for limited broadcast;
1667          * I do not even know whether to fix this or not. Waiting for complaints :-)
1668          */
1669         if (ipv4_is_zeronet(saddr))
1670                 goto martian_source;
1671
1672         if (ipv4_is_zeronet(daddr))
1673                 goto martian_destination;
1674
1675         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1676          * and calls it at most once, when daddr and/or saddr is a loopback address.
1677          */
1678         if (ipv4_is_loopback(daddr)) {
1679                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1680                         goto martian_destination;
1681         } else if (ipv4_is_loopback(saddr)) {
1682                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1683                         goto martian_source;
1684         }
1685
1686         /*
1687          *      Now we are ready to route the packet.
1688          */
1689         fl4.flowi4_oif = 0;
1690         fl4.flowi4_iif = dev->ifindex;
1691         fl4.flowi4_mark = skb->mark;
1692         fl4.flowi4_tos = tos;
1693         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1694         fl4.daddr = daddr;
1695         fl4.saddr = saddr;
1696         err = fib_lookup(net, &fl4, &res);
1697         if (err != 0)
1698                 goto no_route;
1699
1700         RT_CACHE_STAT_INC(in_slow_tot);
1701
1702         if (res.type == RTN_BROADCAST)
1703                 goto brd_input;
1704
1705         if (res.type == RTN_LOCAL) {
1706                 err = fib_validate_source(skb, saddr, daddr, tos,
1707                                           LOOPBACK_IFINDEX,
1708                                           dev, in_dev, &itag);
1709                 if (err < 0)
1710                         goto martian_source_keep_err;
1711                 goto local_input;
1712         }
1713
1714         if (!IN_DEV_FORWARD(in_dev))
1715                 goto no_route;
1716         if (res.type != RTN_UNICAST)
1717                 goto martian_destination;
1718
1719         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1720 out:    return err;
1721
1722 brd_input:
1723         if (skb->protocol != htons(ETH_P_IP))
1724                 goto e_inval;
1725
1726         if (!ipv4_is_zeronet(saddr)) {
1727                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1728                                           in_dev, &itag);
1729                 if (err < 0)
1730                         goto martian_source_keep_err;
1731         }
1732         flags |= RTCF_BROADCAST;
1733         res.type = RTN_BROADCAST;
1734         RT_CACHE_STAT_INC(in_brd);
1735
1736 local_input:
1737         do_cache = false;
1738         if (res.fi) {
1739                 if (!itag) {
1740                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1741                         if (rt_cache_valid(rth)) {
1742                                 skb_dst_set_noref(skb, &rth->dst);
1743                                 err = 0;
1744                                 goto out;
1745                         }
1746                         do_cache = true;
1747                 }
1748         }
1749
1750         rth = rt_dst_alloc(net->loopback_dev,
1751                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1752         if (!rth)
1753                 goto e_nobufs;
1754
1755         rth->dst.input = ip_local_deliver;
1756         rth->dst.output = ip_rt_bug;
1757 #ifdef CONFIG_IP_ROUTE_CLASSID
1758         rth->dst.tclassid = itag;
1759 #endif
1760
1761         rth->rt_genid = rt_genid_ipv4(net);
1762         rth->rt_flags   = flags|RTCF_LOCAL;
1763         rth->rt_type    = res.type;
1764         rth->rt_is_input = 1;
1765         rth->rt_iif     = 0;
1766         rth->rt_pmtu    = 0;
1767         rth->rt_gateway = 0;
1768         rth->rt_uses_gateway = 0;
1769         INIT_LIST_HEAD(&rth->rt_uncached);
1770         if (res.type == RTN_UNREACHABLE) {
1771                 rth->dst.input = ip_error;
1772                 rth->dst.error = -err;
1773                 rth->rt_flags   &= ~RTCF_LOCAL;
1774         }
1775         if (do_cache)
1776                 rt_cache_route(&FIB_RES_NH(res), rth);
1777         skb_dst_set(skb, &rth->dst);
1778         err = 0;
1779         goto out;
1780
1781 no_route:
1782         RT_CACHE_STAT_INC(in_no_route);
1783         res.type = RTN_UNREACHABLE;
1784         if (err == -ESRCH)
1785                 err = -ENETUNREACH;
1786         goto local_input;
1787
1788         /*
1789          *      Do not cache martian addresses: they should be logged (RFC1812)
1790          */
1791 martian_destination:
1792         RT_CACHE_STAT_INC(in_martian_dst);
1793 #ifdef CONFIG_IP_ROUTE_VERBOSE
1794         if (IN_DEV_LOG_MARTIANS(in_dev))
1795                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1796                                      &daddr, &saddr, dev->name);
1797 #endif
1798
1799 e_inval:
1800         err = -EINVAL;
1801         goto out;
1802
1803 e_nobufs:
1804         err = -ENOBUFS;
1805         goto out;
1806
1807 martian_source:
1808         err = -EINVAL;
1809 martian_source_keep_err:
1810         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1811         goto out;
1812 }
1813
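/* Entry point for input route resolution.  The dst is attached to the skb
 * without an extra reference, hence "noref".  A minimal sketch of a typical
 * caller (ip_rcv_finish() does essentially this):
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 */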
1814 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1815                          u8 tos, struct net_device *dev)
1816 {
1817         int res;
1818
1819         rcu_read_lock();
1820
1821         /* Multicast recognition logic was moved from the route cache to here.
1822            The problem was that too many Ethernet cards have broken/missing
1823            hardware multicast filters :-( As a result, a host on a multicast
1824            network acquired a lot of useless route cache entries, in effect
1825            SDR messages from all over the world. Now we try to get rid of them.
1826            Really, provided the software IP multicast filter is organized
1827            reasonably (at least, hashed), it does not result in a slowdown
1828            compared with route cache reject entries.
1829            Note that multicast routers are not affected, because a
1830            route cache entry is created eventually.
1831          */
1832         if (ipv4_is_multicast(daddr)) {
1833                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1834
1835                 if (in_dev) {
1836                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1837                                                   ip_hdr(skb)->protocol);
1838                         if (our
1839 #ifdef CONFIG_IP_MROUTE
1840                                 ||
1841                             (!ipv4_is_local_multicast(daddr) &&
1842                              IN_DEV_MFORWARD(in_dev))
1843 #endif
1844                            ) {
1845                                 int res = ip_route_input_mc(skb, daddr, saddr,
1846                                                             tos, dev, our);
1847                                 rcu_read_unlock();
1848                                 return res;
1849                         }
1850                 }
1851                 rcu_read_unlock();
1852                 return -EINVAL;
1853         }
1854         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1855         rcu_read_unlock();
1856         return res;
1857 }
1858 EXPORT_SYMBOL(ip_route_input_noref);
1859
1860 /* called with rcu_read_lock() */
1861 static struct rtable *__mkroute_output(const struct fib_result *res,
1862                                        const struct flowi4 *fl4, int orig_oif,
1863                                        struct net_device *dev_out,
1864                                        unsigned int flags)
1865 {
1866         struct fib_info *fi = res->fi;
1867         struct fib_nh_exception *fnhe;
1868         struct in_device *in_dev;
1869         u16 type = res->type;
1870         struct rtable *rth;
1871         bool do_cache;
1872
1873         in_dev = __in_dev_get_rcu(dev_out);
1874         if (!in_dev)
1875                 return ERR_PTR(-EINVAL);
1876
1877         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1878                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1879                         return ERR_PTR(-EINVAL);
1880
1881         if (ipv4_is_lbcast(fl4->daddr))
1882                 type = RTN_BROADCAST;
1883         else if (ipv4_is_multicast(fl4->daddr))
1884                 type = RTN_MULTICAST;
1885         else if (ipv4_is_zeronet(fl4->daddr))
1886                 return ERR_PTR(-EINVAL);
1887
1888         if (dev_out->flags & IFF_LOOPBACK)
1889                 flags |= RTCF_LOCAL;
1890
1891         do_cache = true;
1892         if (type == RTN_BROADCAST) {
1893                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1894                 fi = NULL;
1895         } else if (type == RTN_MULTICAST) {
1896                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1897                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1898                                      fl4->flowi4_proto))
1899                         flags &= ~RTCF_LOCAL;
1900                 else
1901                         do_cache = false;
1902                 /* If a multicast route does not exist, use the
1903                  * default one, but do not use the gateway in this case.
1904                  * Yes, it is a hack.
1905                  */
1906                 if (fi && res->prefixlen < 4)
1907                         fi = NULL;
1908         }
1909
1910         fnhe = NULL;
1911         do_cache &= fi != NULL;
1912         if (do_cache) {
1913                 struct rtable __rcu **prth;
1914                 struct fib_nh *nh = &FIB_RES_NH(*res);
1915
1916                 fnhe = find_exception(nh, fl4->daddr);
1917                 if (fnhe)
1918                         prth = &fnhe->fnhe_rth_output;
1919                 else {
1920                         if (unlikely(fl4->flowi4_flags &
1921                                      FLOWI_FLAG_KNOWN_NH &&
1922                                      !(nh->nh_gw &&
1923                                        nh->nh_scope == RT_SCOPE_LINK))) {
1924                                 do_cache = false;
1925                                 goto add;
1926                         }
1927                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1928                 }
1929                 rth = rcu_dereference(*prth);
1930                 if (rt_cache_valid(rth)) {
1931                         dst_hold(&rth->dst);
1932                         return rth;
1933                 }
1934         }
1935
1936 add:
1937         rth = rt_dst_alloc(dev_out,
1938                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1939                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1940                            do_cache);
1941         if (!rth)
1942                 return ERR_PTR(-ENOBUFS);
1943
1944         rth->dst.output = ip_output;
1945
1946         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1947         rth->rt_flags   = flags;
1948         rth->rt_type    = type;
1949         rth->rt_is_input = 0;
1950         rth->rt_iif     = orig_oif ? : 0;
1951         rth->rt_pmtu    = 0;
1952         rth->rt_gateway = 0;
1953         rth->rt_uses_gateway = 0;
1954         INIT_LIST_HEAD(&rth->rt_uncached);
1955
1956         RT_CACHE_STAT_INC(out_slow_tot);
1957
1958         if (flags & RTCF_LOCAL)
1959                 rth->dst.input = ip_local_deliver;
1960         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1961                 if (flags & RTCF_LOCAL &&
1962                     !(dev_out->flags & IFF_LOOPBACK)) {
1963                         rth->dst.output = ip_mc_output;
1964                         RT_CACHE_STAT_INC(out_slow_mc);
1965                 }
1966 #ifdef CONFIG_IP_MROUTE
1967                 if (type == RTN_MULTICAST) {
1968                         if (IN_DEV_MFORWARD(in_dev) &&
1969                             !ipv4_is_local_multicast(fl4->daddr)) {
1970                                 rth->dst.input = ip_mr_input;
1971                                 rth->dst.output = ip_mc_output;
1972                         }
1973                 }
1974 #endif
1975         }
1976
1977         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1978
1979         return rth;
1980 }
1981
1982 /*
1983  * Major route resolver routine.
1984  */
1985
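/* A minimal sketch of a lookup through this resolver; the names below are
 * illustrative and error handling is abbreviated:
 *
 *	struct flowi4 fl4 = {
 *		.daddr      = dst_ip,
 *		.saddr      = src_ip,
 *		.flowi4_tos = RT_TOS(tos),
 *		.flowi4_oif = oif,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */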
1986 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1987 {
1988         struct net_device *dev_out = NULL;
1989         __u8 tos = RT_FL_TOS(fl4);
1990         unsigned int flags = 0;
1991         struct fib_result res;
1992         struct rtable *rth;
1993         int orig_oif;
1994
1995         res.tclassid    = 0;
1996         res.fi          = NULL;
1997         res.table       = NULL;
1998
1999         orig_oif = fl4->flowi4_oif;
2000
2001         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2002         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2003         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2004                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2005
2006         rcu_read_lock();
2007         if (fl4->saddr) {
2008                 rth = ERR_PTR(-EINVAL);
2009                 if (ipv4_is_multicast(fl4->saddr) ||
2010                     ipv4_is_lbcast(fl4->saddr) ||
2011                     ipv4_is_zeronet(fl4->saddr))
2012                         goto out;
2013
2014                 /* I removed the check for oif == dev_out->oif here.
2015                    It was wrong for two reasons:
2016                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2017                       is assigned to multiple interfaces.
2018                    2. Moreover, we are allowed to send packets with the saddr
2019                       of another iface. --ANK
2020                  */
2021
2022                 if (fl4->flowi4_oif == 0 &&
2023                     (ipv4_is_multicast(fl4->daddr) ||
2024                      ipv4_is_lbcast(fl4->daddr))) {
2025                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2026                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2027                         if (dev_out == NULL)
2028                                 goto out;
2029
2030                         /* Special hack: the user can direct multicasts
2031                            and limited broadcast via the necessary interface
2032                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2033                            This hack is not just for fun; it allows
2034                            vic, vat and friends to work.
2035                            They bind a socket to loopback, set the ttl to zero
2036                            and expect that it will work.
2037                            From the viewpoint of the routing cache they are broken,
2038                            because we are not allowed to build a multicast path
2039                            with a loopback source addr (the routing cache
2040                            cannot know that the ttl is zero, so the packet
2041                            will never leave this host and the route is valid).
2042                            Luckily, this hack is a good workaround.
2043                          */
2044
2045                         fl4->flowi4_oif = dev_out->ifindex;
2046                         goto make_route;
2047                 }
2048
2049                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2050                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2051                         if (!__ip_dev_find(net, fl4->saddr, false))
2052                                 goto out;
2053                 }
2054         }
2055
2056
2057         if (fl4->flowi4_oif) {
2058                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2059                 rth = ERR_PTR(-ENODEV);
2060                 if (dev_out == NULL)
2061                         goto out;
2062
2063                 /* RACE: Check return value of inet_select_addr instead. */
2064                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2065                         rth = ERR_PTR(-ENETUNREACH);
2066                         goto out;
2067                 }
2068                 if (ipv4_is_local_multicast(fl4->daddr) ||
2069                     ipv4_is_lbcast(fl4->daddr)) {
2070                         if (!fl4->saddr)
2071                                 fl4->saddr = inet_select_addr(dev_out, 0,
2072                                                               RT_SCOPE_LINK);
2073                         goto make_route;
2074                 }
2075                 if (fl4->saddr) {
2076                         if (ipv4_is_multicast(fl4->daddr))
2077                                 fl4->saddr = inet_select_addr(dev_out, 0,
2078                                                               fl4->flowi4_scope);
2079                         else if (!fl4->daddr)
2080                                 fl4->saddr = inet_select_addr(dev_out, 0,
2081                                                               RT_SCOPE_HOST);
2082                 }
2083         }
2084
2085         if (!fl4->daddr) {
2086                 fl4->daddr = fl4->saddr;
2087                 if (!fl4->daddr)
2088                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2089                 dev_out = net->loopback_dev;
2090                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2091                 res.type = RTN_LOCAL;
2092                 flags |= RTCF_LOCAL;
2093                 goto make_route;
2094         }
2095
2096         if (fib_lookup(net, fl4, &res)) {
2097                 res.fi = NULL;
2098                 res.table = NULL;
2099                 if (fl4->flowi4_oif) {
2100                         /* Apparently, the routing tables are wrong. Assume
2101                            that the destination is on-link.
2102
2103                            WHY? DW.
2104                            Because we are allowed to send to an iface
2105                            even if it has NO routes and NO assigned
2106                            addresses. When oif is specified, the routing
2107                            tables are looked up with only one purpose:
2108                            to catch whether the destination is gatewayed, rather
2109                            than direct. Moreover, if MSG_DONTROUTE is set,
2110                            we send the packet, ignoring both the routing tables
2111                            and the ifaddr state. --ANK
2112
2113
2114                            We could do this even if oif is unknown
2115                            (as IPv6 likely does), but we do not.
2116                          */
2117
2118                         if (fl4->saddr == 0)
2119                                 fl4->saddr = inet_select_addr(dev_out, 0,
2120                                                               RT_SCOPE_LINK);
2121                         res.type = RTN_UNICAST;
2122                         goto make_route;
2123                 }
2124                 rth = ERR_PTR(-ENETUNREACH);
2125                 goto out;
2126         }
2127
2128         if (res.type == RTN_LOCAL) {
2129                 if (!fl4->saddr) {
2130                         if (res.fi->fib_prefsrc)
2131                                 fl4->saddr = res.fi->fib_prefsrc;
2132                         else
2133                                 fl4->saddr = fl4->daddr;
2134                 }
2135                 dev_out = net->loopback_dev;
2136                 fl4->flowi4_oif = dev_out->ifindex;
2137                 flags |= RTCF_LOCAL;
2138                 goto make_route;
2139         }
2140
2141 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2142         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2143                 fib_select_multipath(&res);
2144         else
2145 #endif
2146         if (!res.prefixlen &&
2147             res.table->tb_num_default > 1 &&
2148             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2149                 fib_select_default(&res);
2150
2151         if (!fl4->saddr)
2152                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2153
2154         dev_out = FIB_RES_DEV(res);
2155         fl4->flowi4_oif = dev_out->ifindex;
2156
2157
2158 make_route:
2159         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2160
2161 out:
2162         rcu_read_unlock();
2163         return rth;
2164 }
2165 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2166
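/* dst_ops for "blackhole" routes: every operation is a no-op and ->check
 * always fails, so a socket holding such a dst re-resolves its route on
 * the next use instead of transmitting through it.
 */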
2167 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2168 {
2169         return NULL;
2170 }
2171
2172 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2173 {
2174         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2175
2176         return mtu ? : dst->dev->mtu;
2177 }
2178
2179 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2180                                           struct sk_buff *skb, u32 mtu)
2181 {
2182 }
2183
2184 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2185                                        struct sk_buff *skb)
2186 {
2187 }
2188
2189 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2190                                           unsigned long old)
2191 {
2192         return NULL;
2193 }
2194
2195 static struct dst_ops ipv4_dst_blackhole_ops = {
2196         .family                 =       AF_INET,
2197         .protocol               =       cpu_to_be16(ETH_P_IP),
2198         .check                  =       ipv4_blackhole_dst_check,
2199         .mtu                    =       ipv4_blackhole_mtu,
2200         .default_advmss         =       ipv4_default_advmss,
2201         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2202         .redirect               =       ipv4_rt_blackhole_redirect,
2203         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2204         .neigh_lookup           =       ipv4_neigh_lookup,
2205 };
2206
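/* Clone an existing route into a blackhole dst that silently discards
 * everything sent through it; used, e.g., by xfrm so packets are dropped
 * rather than transmitted in the clear while a policy cannot be satisfied.
 */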
2207 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2208 {
2209         struct rtable *ort = (struct rtable *) dst_orig;
2210         struct rtable *rt;
2211
2212         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2213         if (rt) {
2214                 struct dst_entry *new = &rt->dst;
2215
2216                 new->__use = 1;
2217                 new->input = dst_discard;
2218                 new->output = dst_discard;
2219
2220                 new->dev = ort->dst.dev;
2221                 if (new->dev)
2222                         dev_hold(new->dev);
2223
2224                 rt->rt_is_input = ort->rt_is_input;
2225                 rt->rt_iif = ort->rt_iif;
2226                 rt->rt_pmtu = ort->rt_pmtu;
2227
2228                 rt->rt_genid = rt_genid_ipv4(net);
2229                 rt->rt_flags = ort->rt_flags;
2230                 rt->rt_type = ort->rt_type;
2231                 rt->rt_gateway = ort->rt_gateway;
2232                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2233
2234                 INIT_LIST_HEAD(&rt->rt_uncached);
2235
2236                 dst_free(new);
2237         }
2238
2239         dst_release(dst_orig);
2240
2241         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2242 }
2243
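/* Like __ip_route_output_key(), but additionally passes the result through
 * xfrm_lookup() so that IPsec policy can transform or veto the route.
 */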
2244 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2245                                     struct sock *sk)
2246 {
2247         struct rtable *rt = __ip_route_output_key(net, flp4);
2248
2249         if (IS_ERR(rt))
2250                 return rt;
2251
2252         if (flp4->flowi4_proto)
2253                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2254                                                    flowi4_to_flowi(flp4),
2255                                                    sk, 0);
2256
2257         return rt;
2258 }
2259 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2260
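/* Fill an RTM_NEWROUTE netlink message describing the rtable attached to
 * @skb, keyed by @fl4; returns the message length, or a negative error if
 * the attributes do not fit.
 */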
2261 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2262                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2263                         u32 seq, int event, int nowait, unsigned int flags)
2264 {
2265         struct rtable *rt = skb_rtable(skb);
2266         struct rtmsg *r;
2267         struct nlmsghdr *nlh;
2268         unsigned long expires = 0;
2269         u32 error;
2270         u32 metrics[RTAX_MAX];
2271
2272         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2273         if (nlh == NULL)
2274                 return -EMSGSIZE;
2275
2276         r = nlmsg_data(nlh);
2277         r->rtm_family    = AF_INET;
2278         r->rtm_dst_len  = 32;
2279         r->rtm_src_len  = 0;
2280         r->rtm_tos      = fl4->flowi4_tos;
2281         r->rtm_table    = RT_TABLE_MAIN;
2282         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2283                 goto nla_put_failure;
2284         r->rtm_type     = rt->rt_type;
2285         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2286         r->rtm_protocol = RTPROT_UNSPEC;
2287         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2288         if (rt->rt_flags & RTCF_NOTIFY)
2289                 r->rtm_flags |= RTM_F_NOTIFY;
2290
2291         if (nla_put_be32(skb, RTA_DST, dst))
2292                 goto nla_put_failure;
2293         if (src) {
2294                 r->rtm_src_len = 32;
2295                 if (nla_put_be32(skb, RTA_SRC, src))
2296                         goto nla_put_failure;
2297         }
2298         if (rt->dst.dev &&
2299             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2300                 goto nla_put_failure;
2301 #ifdef CONFIG_IP_ROUTE_CLASSID
2302         if (rt->dst.tclassid &&
2303             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2304                 goto nla_put_failure;
2305 #endif
2306         if (!rt_is_input_route(rt) &&
2307             fl4->saddr != src) {
2308                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2309                         goto nla_put_failure;
2310         }
2311         if (rt->rt_uses_gateway &&
2312             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2313                 goto nla_put_failure;
2314
2315         expires = rt->dst.expires;
2316         if (expires) {
2317                 unsigned long now = jiffies;
2318
2319                 if (time_before(now, expires))
2320                         expires -= now;
2321                 else
2322                         expires = 0;
2323         }
2324
2325         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2326         if (rt->rt_pmtu && expires)
2327                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2328         if (rtnetlink_put_metrics(skb, metrics) < 0)
2329                 goto nla_put_failure;
2330
2331         if (fl4->flowi4_mark &&
2332             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2333                 goto nla_put_failure;
2334
2335         error = rt->dst.error;
2336
2337         if (rt_is_input_route(rt)) {
2338 #ifdef CONFIG_IP_MROUTE
2339                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2340                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2341                         int err = ipmr_get_route(net, skb,
2342                                                  fl4->saddr, fl4->daddr,
2343                                                  r, nowait);
2344                         if (err <= 0) {
2345                                 if (!nowait) {
2346                                         if (err == 0)
2347                                                 return 0;
2348                                         goto nla_put_failure;
2349                                 } else {
2350                                         if (err == -EMSGSIZE)
2351                                                 goto nla_put_failure;
2352                                         error = err;
2353                                 }
2354                         }
2355                 } else
2356 #endif
2357                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2358                                 goto nla_put_failure;
2359         }
2360
2361         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2362                 goto nla_put_failure;
2363
2364         return nlmsg_end(skb, nlh);
2365
2366 nla_put_failure:
2367         nlmsg_cancel(skb, nlh);
2368         return -EMSGSIZE;
2369 }
2370
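/* RTM_GETROUTE handler: resolve a route exactly as the stack would for a
 * real packet (input path if RTA_IIF is given, output path otherwise) and
 * report the result over netlink.  This is what, for example,
 *
 *	$ ip route get 8.8.8.8
 *
 * uses under the hood.
 */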
2371 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2372 {
2373         struct net *net = sock_net(in_skb->sk);
2374         struct rtmsg *rtm;
2375         struct nlattr *tb[RTA_MAX+1];
2376         struct rtable *rt = NULL;
2377         struct flowi4 fl4;
2378         __be32 dst = 0;
2379         __be32 src = 0;
2380         u32 iif;
2381         int err;
2382         int mark;
2383         struct sk_buff *skb;
2384
2385         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2386         if (err < 0)
2387                 goto errout;
2388
2389         rtm = nlmsg_data(nlh);
2390
2391         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2392         if (skb == NULL) {
2393                 err = -ENOBUFS;
2394                 goto errout;
2395         }
2396
2397         /* Reserve room for dummy headers; this skb can pass
2398            through a good chunk of the routing engine.
2399          */
2400         skb_reset_mac_header(skb);
2401         skb_reset_network_header(skb);
2402
2403         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2404         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2405         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2406
2407         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2408         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2409         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2410         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2411
2412         memset(&fl4, 0, sizeof(fl4));
2413         fl4.daddr = dst;
2414         fl4.saddr = src;
2415         fl4.flowi4_tos = rtm->rtm_tos;
2416         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2417         fl4.flowi4_mark = mark;
2418
2419         if (iif) {
2420                 struct net_device *dev;
2421
2422                 dev = __dev_get_by_index(net, iif);
2423                 if (dev == NULL) {
2424                         err = -ENODEV;
2425                         goto errout_free;
2426                 }
2427
2428                 skb->protocol   = htons(ETH_P_IP);
2429                 skb->dev        = dev;
2430                 skb->mark       = mark;
2431                 local_bh_disable();
2432                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2433                 local_bh_enable();
2434
2435                 rt = skb_rtable(skb);
2436                 if (err == 0 && rt->dst.error)
2437                         err = -rt->dst.error;
2438         } else {
2439                 rt = ip_route_output_key(net, &fl4);
2440
2441                 err = 0;
2442                 if (IS_ERR(rt))
2443                         err = PTR_ERR(rt);
2444         }
2445
2446         if (err)
2447                 goto errout_free;
2448
2449         skb_dst_set(skb, &rt->dst);
2450         if (rtm->rtm_flags & RTM_F_NOTIFY)
2451                 rt->rt_flags |= RTCF_NOTIFY;
2452
2453         err = rt_fill_info(net, dst, src, &fl4, skb,
2454                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2455                            RTM_NEWROUTE, 0, 0);
2456         if (err <= 0)
2457                 goto errout_free;
2458
2459         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2460 errout:
2461         return err;
2462
2463 errout_free:
2464         kfree_skb(skb);
2465         goto errout;
2466 }
2467
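/* With the route cache gone there is nothing to dump; returning the current
 * length reports an empty, complete dump.
 */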
2468 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2469 {
2470         return skb->len;
2471 }
2472
2473 void ip_rt_multicast_event(struct in_device *in_dev)
2474 {
2475         rt_cache_flush(dev_net(in_dev->dev));
2476 }
2477
2478 #ifdef CONFIG_SYSCTL
2479 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2480 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2481 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2482 static int ip_rt_gc_elasticity __read_mostly    = 8;
2483
2484 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2485                                         void __user *buffer,
2486                                         size_t *lenp, loff_t *ppos)
2487 {
2488         struct net *net = (struct net *)__ctl->extra1;
2489
2490         if (write) {
2491                 rt_cache_flush(net);
2492                 fnhe_genid_bump(net);
2493                 return 0;
2494         }
2495
2496         return -EINVAL;
2497 }
2498
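/* The tunables below appear under /proc/sys/net/ipv4/route/; for example,
 * the learned-PMTU floor can be raised with:
 *
 *	# sysctl -w net.ipv4.route.min_pmtu=1024
 */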
2499 static struct ctl_table ipv4_route_table[] = {
2500         {
2501                 .procname       = "gc_thresh",
2502                 .data           = &ipv4_dst_ops.gc_thresh,
2503                 .maxlen         = sizeof(int),
2504                 .mode           = 0644,
2505                 .proc_handler   = proc_dointvec,
2506         },
2507         {
2508                 .procname       = "max_size",
2509                 .data           = &ip_rt_max_size,
2510                 .maxlen         = sizeof(int),
2511                 .mode           = 0644,
2512                 .proc_handler   = proc_dointvec,
2513         },
2514         {
2515                 /*  Deprecated. Use gc_min_interval_ms */
2516
2517                 .procname       = "gc_min_interval",
2518                 .data           = &ip_rt_gc_min_interval,
2519                 .maxlen         = sizeof(int),
2520                 .mode           = 0644,
2521                 .proc_handler   = proc_dointvec_jiffies,
2522         },
2523         {
2524                 .procname       = "gc_min_interval_ms",
2525                 .data           = &ip_rt_gc_min_interval,
2526                 .maxlen         = sizeof(int),
2527                 .mode           = 0644,
2528                 .proc_handler   = proc_dointvec_ms_jiffies,
2529         },
2530         {
2531                 .procname       = "gc_timeout",
2532                 .data           = &ip_rt_gc_timeout,
2533                 .maxlen         = sizeof(int),
2534                 .mode           = 0644,
2535                 .proc_handler   = proc_dointvec_jiffies,
2536         },
2537         {
2538                 .procname       = "gc_interval",
2539                 .data           = &ip_rt_gc_interval,
2540                 .maxlen         = sizeof(int),
2541                 .mode           = 0644,
2542                 .proc_handler   = proc_dointvec_jiffies,
2543         },
2544         {
2545                 .procname       = "redirect_load",
2546                 .data           = &ip_rt_redirect_load,
2547                 .maxlen         = sizeof(int),
2548                 .mode           = 0644,
2549                 .proc_handler   = proc_dointvec,
2550         },
2551         {
2552                 .procname       = "redirect_number",
2553                 .data           = &ip_rt_redirect_number,
2554                 .maxlen         = sizeof(int),
2555                 .mode           = 0644,
2556                 .proc_handler   = proc_dointvec,
2557         },
2558         {
2559                 .procname       = "redirect_silence",
2560                 .data           = &ip_rt_redirect_silence,
2561                 .maxlen         = sizeof(int),
2562                 .mode           = 0644,
2563                 .proc_handler   = proc_dointvec,
2564         },
2565         {
2566                 .procname       = "error_cost",
2567                 .data           = &ip_rt_error_cost,
2568                 .maxlen         = sizeof(int),
2569                 .mode           = 0644,
2570                 .proc_handler   = proc_dointvec,
2571         },
2572         {
2573                 .procname       = "error_burst",
2574                 .data           = &ip_rt_error_burst,
2575                 .maxlen         = sizeof(int),
2576                 .mode           = 0644,
2577                 .proc_handler   = proc_dointvec,
2578         },
2579         {
2580                 .procname       = "gc_elasticity",
2581                 .data           = &ip_rt_gc_elasticity,
2582                 .maxlen         = sizeof(int),
2583                 .mode           = 0644,
2584                 .proc_handler   = proc_dointvec,
2585         },
2586         {
2587                 .procname       = "mtu_expires",
2588                 .data           = &ip_rt_mtu_expires,
2589                 .maxlen         = sizeof(int),
2590                 .mode           = 0644,
2591                 .proc_handler   = proc_dointvec_jiffies,
2592         },
2593         {
2594                 .procname       = "min_pmtu",
2595                 .data           = &ip_rt_min_pmtu,
2596                 .maxlen         = sizeof(int),
2597                 .mode           = 0644,
2598                 .proc_handler   = proc_dointvec,
2599         },
2600         {
2601                 .procname       = "min_adv_mss",
2602                 .data           = &ip_rt_min_advmss,
2603                 .maxlen         = sizeof(int),
2604                 .mode           = 0644,
2605                 .proc_handler   = proc_dointvec,
2606         },
2607         { }
2608 };
2609
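/* The write-only "flush" entry below lets an administrator invalidate all
 * cached routes and exceptions at once:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */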
2610 static struct ctl_table ipv4_route_flush_table[] = {
2611         {
2612                 .procname       = "flush",
2613                 .maxlen         = sizeof(int),
2614                 .mode           = 0200,
2615                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2616         },
2617         { },
2618 };
2619
2620 static __net_init int sysctl_route_net_init(struct net *net)
2621 {
2622         struct ctl_table *tbl;
2623
2624         tbl = ipv4_route_flush_table;
2625         if (!net_eq(net, &init_net)) {
2626                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2627                 if (tbl == NULL)
2628                         goto err_dup;
2629
2630                 /* Don't export sysctls to unprivileged users */
2631                 if (net->user_ns != &init_user_ns)
2632                         tbl[0].procname = NULL;
2633         }
2634         tbl[0].extra1 = net;
2635
2636         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2637         if (net->ipv4.route_hdr == NULL)
2638                 goto err_reg;
2639         return 0;
2640
2641 err_reg:
2642         if (tbl != ipv4_route_flush_table)
2643                 kfree(tbl);
2644 err_dup:
2645         return -ENOMEM;
2646 }
2647
2648 static __net_exit void sysctl_route_net_exit(struct net *net)
2649 {
2650         struct ctl_table *tbl;
2651
2652         tbl = net->ipv4.route_hdr->ctl_table_arg;
2653         unregister_net_sysctl_table(net->ipv4.route_hdr);
2654         BUG_ON(tbl == ipv4_route_flush_table);
2655         kfree(tbl);
2656 }
2657
2658 static __net_initdata struct pernet_operations sysctl_route_ops = {
2659         .init = sysctl_route_net_init,
2660         .exit = sysctl_route_net_exit,
2661 };
2662 #endif
2663
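/* Per-netns generation counters.  Bumping rt_genid (rt_cache_flush() does
 * exactly this) instantly invalidates every cached route in the namespace,
 * as entries whose recorded genid no longer matches are treated as expired.
 */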
2664 static __net_init int rt_genid_init(struct net *net)
2665 {
2666         atomic_set(&net->ipv4.rt_genid, 0);
2667         atomic_set(&net->fnhe_genid, 0);
2668         get_random_bytes(&net->ipv4.dev_addr_genid,
2669                          sizeof(net->ipv4.dev_addr_genid));
2670         return 0;
2671 }
2672
2673 static __net_initdata struct pernet_operations rt_genid_ops = {
2674         .init = rt_genid_init,
2675 };
2676
2677 static int __net_init ipv4_inetpeer_init(struct net *net)
2678 {
2679         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2680
2681         if (!bp)
2682                 return -ENOMEM;
2683         inet_peer_base_init(bp);
2684         net->ipv4.peers = bp;
2685         return 0;
2686 }
2687
2688 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2689 {
2690         struct inet_peer_base *bp = net->ipv4.peers;
2691
2692         net->ipv4.peers = NULL;
2693         inetpeer_invalidate_tree(bp);
2694         kfree(bp);
2695 }
2696
2697 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2698         .init   =       ipv4_inetpeer_init,
2699         .exit   =       ipv4_inetpeer_exit,
2700 };
2701
2702 #ifdef CONFIG_IP_ROUTE_CLASSID
2703 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2704 #endif /* CONFIG_IP_ROUTE_CLASSID */
2705
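/* One-time boot initialization of the IPv4 routing subsystem, called from
 * the inet init path.  Note that gc_thresh and ip_rt_max_size are
 * effectively disabled below, since full routes are no longer cached per flow.
 */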
2706 int __init ip_rt_init(void)
2707 {
2708         int rc = 0;
2709
2710 #ifdef CONFIG_IP_ROUTE_CLASSID
2711         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2712         if (!ip_rt_acct)
2713                 panic("IP: failed to allocate ip_rt_acct\n");
2714 #endif
2715
2716         ipv4_dst_ops.kmem_cachep =
2717                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2718                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2719
2720         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2721
2722         if (dst_entries_init(&ipv4_dst_ops) < 0)
2723                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2724
2725         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2726                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2727
2728         ipv4_dst_ops.gc_thresh = ~0;
2729         ip_rt_max_size = INT_MAX;
2730
2731         devinet_init();
2732         ip_fib_init();
2733
2734         if (ip_rt_proc_init())
2735                 pr_err("Unable to create route proc files\n");
2736 #ifdef CONFIG_XFRM
2737         xfrm_init();
2738         xfrm4_init();
2739 #endif
2740         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2741
2742 #ifdef CONFIG_SYSCTL
2743         register_pernet_subsys(&sysctl_route_ops);
2744 #endif
2745         register_pernet_subsys(&rt_genid_ops);
2746         register_pernet_subsys(&ipv4_inetpeer_ops);
2747         return rc;
2748 }
2749
2750 #ifdef CONFIG_SYSCTL
2751 /*
2752  * We really need to sanitize the damn ipv4 init order, then all
2753  * this nonsense will go away.
2754  */
2755 void __init ip_static_sysctl_init(void)
2756 {
2757         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2758 }
2759 #endif