/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
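
/* Worked example of the timing constants above (values are in jiffies;
 * HZ jiffies == 1 second, so the figures hold for any HZ, modulo integer
 * rounding): ip_rt_redirect_load is HZ/50, i.e. ~20ms, and
 * ip_rt_redirect_silence shifts that by (9 + 1) -- ip_rt_redirect_number
 * plus one -- giving 1024 * 20ms, i.e. ~20.5s.  ip_rt_error_cost (HZ)
 * against ip_rt_error_burst (5 * HZ) permits on average one ICMP error
 * per second with bursts of five (see ip_error() below).  ip_rt_min_pmtu
 * reads literally as 512 bytes of payload plus 20-byte IP and TCP
 * headers, i.e. 552.
 */
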
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
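
/* Example of how ip_tos2prio is consumed (the lookup helper lives
 * elsewhere, cf. rt_tos2priority() in include/net/route.h, not shown
 * here): the TOS byte is masked to its four TOS bits and shifted right
 * by one, so IPTOS_THROUGHPUT (0x08) indexes entry 4, TC_PRIO_BULK,
 * and IPTOS_LOWDELAY (0x10) indexes entry 8, TC_PRIO_INTERACTIVE.
 * The odd entries are the ECN_OR_COST() aliases selected when bit 1
 * of the TOS byte is set.
 */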

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;

        rt = (const struct rtable *)dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *)&rt->rt_gateway;
        else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
                return;

        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
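
/* Example of the perturbation above: if a bucket's generator was last
 * used 3 seconds ago (now - old == 3 * HZ), delta is drawn uniformly
 * from [0, 3 * HZ), so sampling the returned IDs before and after an
 * idle period reveals little about how many IDs were consumed in
 * between.  A continuously busy bucket (old == now) keeps delta == 0
 * and behaves as a plain per-bucket counter incremented by segs.
 */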

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the nexthop's cached routes
                 * stale, so that anyone caching them rechecks whether
                 * this exception applies.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
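
/* Worked schedule with the defaults above (ip_rt_redirect_number == 9,
 * ip_rt_redirect_load == HZ/50): with rate_tokens == k, the next
 * redirect is eligible once jiffies passes rate_last + (HZ/50 << k),
 * i.e. gaps of roughly 40ms, 80ms, ... up to ~5.1s (the first redirect
 * is sent immediately); after nine unanswered redirects we go quiet
 * until ip_rt_redirect_silence (~20.5s) elapses with no redirected
 * packets, which resets rate_tokens to zero.
 */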

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything;
         * set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
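
/* Example of the clamping above: an ICMP "fragmentation needed" quoting
 * an MTU of 296 is raised to ip_rt_min_pmtu (552) before the exception
 * is created, and the learned value lives for ip_rt_mtu_expires
 * (10 minutes); an update merely restating the current PMTU while more
 * than half of that lifetime remains is ignored rather than refreshed.
 */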

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPv4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
         * down into this function.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
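
/* Resolution order illustrated: a route holding an unexpired learned
 * PMTU of 1400 reports 1400 even on a 1500-byte device; once that
 * exception expires, an RTAX_MTU metric (if any) wins; only when both
 * are absent does dev->mtu apply, clamped to 576 for a locked metric
 * on a gatewayed route, capped at IP_MAX_MTU, and reduced by any
 * lwtunnel encapsulation headroom.
 */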

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
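
/* The cmpxchg() above makes each nexthop slot a lock-free single-entry
 * cache: if another CPU installed a different route after we read *p,
 * our store is abandoned and rt_cache_route() returns false, in which
 * case the caller (rt_set_nexthop() below) flags the route DST_NOCACHE
 * and parks it on the per-cpu uncached list instead.
 */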
1367
1368 struct uncached_list {
1369         spinlock_t              lock;
1370         struct list_head        head;
1371 };
1372
1373 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1374
1375 static void rt_add_uncached_list(struct rtable *rt)
1376 {
1377         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1378
1379         rt->rt_uncached_list = ul;
1380
1381         spin_lock_bh(&ul->lock);
1382         list_add_tail(&rt->rt_uncached, &ul->head);
1383         spin_unlock_bh(&ul->lock);
1384 }
1385
1386 static void ipv4_dst_destroy(struct dst_entry *dst)
1387 {
1388         struct rtable *rt = (struct rtable *) dst;
1389
1390         if (rt->fi) {
1391                 fib_info_put(rt->fi);
1392                 rt->fi = NULL;
1393         }
1394
1395         if (!list_empty(&rt->rt_uncached)) {
1396                 struct uncached_list *ul = rt->rt_uncached_list;
1397
1398                 spin_lock_bh(&ul->lock);
1399                 list_del(&rt->rt_uncached);
1400                 spin_unlock_bh(&ul->lock);
1401         }
1402 }
1403
1404 void rt_flush_dev(struct net_device *dev)
1405 {
1406         struct net *net = dev_net(dev);
1407         struct rtable *rt;
1408         int cpu;
1409
1410         for_each_possible_cpu(cpu) {
1411                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1412
1413                 spin_lock_bh(&ul->lock);
1414                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1415                         if (rt->dst.dev != dev)
1416                                 continue;
1417                         rt->dst.dev = net->loopback_dev;
1418                         dev_hold(rt->dst.dev);
1419                         dev_put(dev);
1420                 }
1421                 spin_unlock_bh(&ul->lock);
1422         }
1423 }
1424
1425 static bool rt_cache_valid(const struct rtable *rt)
1426 {
1427         return  rt &&
1428                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1429                 !rt_is_expired(rt);
1430 }
1431
1432 static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
1433 {
1434         if (fi->fib_metrics != (u32 *)dst_default_metrics) {
1435                 fib_info_hold(fi);
1436                 rt->fi = fi;
1437         }
1438
1439         dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1440 }
1441
1442 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1443                            const struct fib_result *res,
1444                            struct fib_nh_exception *fnhe,
1445                            struct fib_info *fi, u16 type, u32 itag)
1446 {
1447         bool cached = false;
1448
1449         if (fi) {
1450                 struct fib_nh *nh = &FIB_RES_NH(*res);
1451
1452                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1453                         rt->rt_gateway = nh->nh_gw;
1454                         rt->rt_uses_gateway = 1;
1455                 }
1456                 rt_init_metrics(rt, fi);
1457 #ifdef CONFIG_IP_ROUTE_CLASSID
1458                 rt->dst.tclassid = nh->nh_tclassid;
1459 #endif
1460                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1461                 if (unlikely(fnhe))
1462                         cached = rt_bind_exception(rt, fnhe, daddr);
1463                 else if (!(rt->dst.flags & DST_NOCACHE))
1464                         cached = rt_cache_route(nh, rt);
1465                 if (unlikely(!cached)) {
1466                         /* Routes we intend to cache in nexthop exception or
1467                          * FIB nexthop have the DST_NOCACHE bit clear.
1468                          * However, if we are unsuccessful at storing this
1469                          * route into the cache we really need to set it.
1470                          */
1471                         rt->dst.flags |= DST_NOCACHE;
1472                         if (!rt->rt_gateway)
1473                                 rt->rt_gateway = daddr;
1474                         rt_add_uncached_list(rt);
1475                 }
1476         } else
1477                 rt_add_uncached_list(rt);
1478
1479 #ifdef CONFIG_IP_ROUTE_CLASSID
1480 #ifdef CONFIG_IP_MULTIPLE_TABLES
1481         set_class_tag(rt, res->tclassid);
1482 #endif
1483         set_class_tag(rt, itag);
1484 #endif
1485 }
1486
1487 struct rtable *rt_dst_alloc(struct net_device *dev,
1488                             unsigned int flags, u16 type,
1489                             bool nopolicy, bool noxfrm, bool will_cache)
1490 {
1491         struct rtable *rt;
1492
1493         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1494                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1495                        (nopolicy ? DST_NOPOLICY : 0) |
1496                        (noxfrm ? DST_NOXFRM : 0));
1497
1498         if (rt) {
1499                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1500                 rt->rt_flags = flags;
1501                 rt->rt_type = type;
1502                 rt->rt_is_input = 0;
1503                 rt->rt_iif = 0;
1504                 rt->rt_pmtu = 0;
1505                 rt->rt_gateway = 0;
1506                 rt->rt_uses_gateway = 0;
1507                 rt->rt_table_id = 0;
1508                 rt->fi = NULL;
1509                 INIT_LIST_HEAD(&rt->rt_uncached);
1510
1511                 rt->dst.output = ip_output;
1512                 if (flags & RTCF_LOCAL)
1513                         rt->dst.input = ip_local_deliver;
1514         }
1515
1516         return rt;
1517 }
1518 EXPORT_SYMBOL(rt_dst_alloc);
1519
1520 /* called in rcu_read_lock() section */
1521 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1522                                 u8 tos, struct net_device *dev, int our)
1523 {
1524         struct rtable *rth;
1525         struct in_device *in_dev = __in_dev_get_rcu(dev);
1526         unsigned int flags = RTCF_MULTICAST;
1527         u32 itag = 0;
1528         int err;
1529
1530         /* Primary sanity checks. */
1531
1532         if (!in_dev)
1533                 return -EINVAL;
1534
1535         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1536             skb->protocol != htons(ETH_P_IP))
1537                 goto e_inval;
1538
1539         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1540                 goto e_inval;
1541
1542         if (ipv4_is_zeronet(saddr)) {
1543                 if (!ipv4_is_local_multicast(daddr))
1544                         goto e_inval;
1545         } else {
1546                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1547                                           in_dev, &itag);
1548                 if (err < 0)
1549                         goto e_err;
1550         }
1551         if (our)
1552                 flags |= RTCF_LOCAL;
1553
1554         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1555                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1556         if (!rth)
1557                 goto e_nobufs;
1558
1559 #ifdef CONFIG_IP_ROUTE_CLASSID
1560         rth->dst.tclassid = itag;
1561 #endif
1562         rth->dst.output = ip_rt_bug;
1563         rth->rt_is_input = 1;
1564
1565 #ifdef CONFIG_IP_MROUTE
1566         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1567                 rth->dst.input = ip_mr_input;
1568 #endif
1569         RT_CACHE_STAT_INC(in_slow_mc);
1570
1571         skb_dst_set(skb, &rth->dst);
1572         return 0;
1573
1574 e_nobufs:
1575         return -ENOBUFS;
1576 e_inval:
1577         return -EINVAL;
1578 e_err:
1579         return err;
1580 }
1581
1582
1583 static void ip_handle_martian_source(struct net_device *dev,
1584                                      struct in_device *in_dev,
1585                                      struct sk_buff *skb,
1586                                      __be32 daddr,
1587                                      __be32 saddr)
1588 {
1589         RT_CACHE_STAT_INC(in_martian_src);
1590 #ifdef CONFIG_IP_ROUTE_VERBOSE
1591         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1592                 /*
1593                  *      RFC1812 recommendation, if source is martian,
1594                  *      the only hint is MAC header.
1595                  */
1596                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1597                         &daddr, &saddr, dev->name);
1598                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1599                         print_hex_dump(KERN_WARNING, "ll header: ",
1600                                        DUMP_PREFIX_OFFSET, 16, 1,
1601                                        skb_mac_header(skb),
1602                                        dev->hard_header_len, true);
1603                 }
1604         }
1605 #endif
1606 }
1607
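/* Remove the exception entry for @daddr from @nh's exception hash and
 * flush the routes cached on it.  Runs under fnhe_lock so the
 * singly-linked chain can be unlinked safely; readers are protected by
 * RCU and the entry itself is freed via kfree_rcu().
 */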
1608 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1609 {
1610         struct fnhe_hash_bucket *hash;
1611         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1612         u32 hval = fnhe_hashfun(daddr);
1613
1614         spin_lock_bh(&fnhe_lock);
1615
1616         hash = rcu_dereference_protected(nh->nh_exceptions,
1617                                          lockdep_is_held(&fnhe_lock));
1618         hash += hval;
1619
1620         fnhe_p = &hash->chain;
1621         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1622         while (fnhe) {
1623                 if (fnhe->fnhe_daddr == daddr) {
1624                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1625                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1626                         fnhe_flush_routes(fnhe);
1627                         kfree_rcu(fnhe, rcu);
1628                         break;
1629                 }
1630                 fnhe_p = &fnhe->fnhe_next;
1631                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1632                                                  lockdep_is_held(&fnhe_lock));
1633         }
1634
1635         spin_unlock_bh(&fnhe_lock);
1636 }
1637
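/* If the lightweight tunnel state asks for it, interpose the lwtunnel
 * input/output handlers in front of the dst's own, saving the originals
 * in orig_input/orig_output so the tunnel code can chain to them.
 */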
1638 static void set_lwt_redirect(struct rtable *rth)
1639 {
1640         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1641                 rth->dst.lwtstate->orig_output = rth->dst.output;
1642                 rth->dst.output = lwtunnel_output;
1643         }
1644
1645         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1646                 rth->dst.lwtstate->orig_input = rth->dst.input;
1647                 rth->dst.input = lwtunnel_input;
1648         }
1649 }
1650
1651 /* called in rcu_read_lock() section */
1652 static int __mkroute_input(struct sk_buff *skb,
1653                            const struct fib_result *res,
1654                            struct in_device *in_dev,
1655                            __be32 daddr, __be32 saddr, u32 tos)
1656 {
1657         struct fib_nh_exception *fnhe;
1658         struct rtable *rth;
1659         int err;
1660         struct in_device *out_dev;
1661         bool do_cache;
1662         u32 itag = 0;
1663
1664         /* get a working reference to the output device */
1665         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1666         if (!out_dev) {
1667                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1668                 return -EINVAL;
1669         }
1670
1671         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1672                                   in_dev->dev, in_dev, &itag);
1673         if (err < 0) {
1674                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1675                                          saddr);
1676
1677                 goto cleanup;
1678         }
1679
1680         do_cache = res->fi && !itag;
1681         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1682             skb->protocol == htons(ETH_P_IP) &&
1683             (IN_DEV_SHARED_MEDIA(out_dev) ||
1684              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1685                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1686
1687         if (skb->protocol != htons(ETH_P_IP)) {
1688                 /* Not IP (i.e. ARP). Do not create a route if it is
1689                  * invalid for proxy arp. DNAT routes are always valid.
1690                  *
1691                  * The proxy arp feature has been extended to allow ARP
1692                  * replies back on the same interface, to support
1693                  * Private VLAN switch technologies. See arp.c.
1694                  */
1695                 if (out_dev == in_dev &&
1696                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1697                         err = -EINVAL;
1698                         goto cleanup;
1699                 }
1700         }
1701
1702         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1703         if (do_cache) {
1704                 if (fnhe) {
1705                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1706                         if (rth && rth->dst.expires &&
1707                             time_after(jiffies, rth->dst.expires)) {
1708                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1709                                 fnhe = NULL;
1710                         } else {
1711                                 goto rt_cache;
1712                         }
1713                 }
1714
1715                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1716
1717 rt_cache:
1718                 if (rt_cache_valid(rth)) {
1719                         skb_dst_set_noref(skb, &rth->dst);
1720                         goto out;
1721                 }
1722         }
1723
1724         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1725                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1726                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1727         if (!rth) {
1728                 err = -ENOBUFS;
1729                 goto cleanup;
1730         }
1731
1732         rth->rt_is_input = 1;
1733         if (res->table)
1734                 rth->rt_table_id = res->table->tb_id;
1735         RT_CACHE_STAT_INC(in_slow_tot);
1736
1737         rth->dst.input = ip_forward;
1738
1739         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1740         set_lwt_redirect(rth);
1741         skb_dst_set(skb, &rth->dst);
1742 out:
1743         err = 0;
1744  cleanup:
1745         return err;
1746 }
1747
1748 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1749 /* To make ICMP packets follow the right flow, the multipath hash is
1750  * calculated from the inner IP addresses.
1751  */
1752 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1753                                  struct flow_keys *hash_keys)
1754 {
1755         const struct iphdr *outer_iph = ip_hdr(skb);
1756         const struct iphdr *inner_iph;
1757         const struct icmphdr *icmph;
1758         struct iphdr _inner_iph;
1759         struct icmphdr _icmph;
1760
1761         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1762         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1763         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1764                 return;
1765
1766         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1767                 return;
1768
1769         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1770                                    &_icmph);
1771         if (!icmph)
1772                 return;
1773
1774         if (icmph->type != ICMP_DEST_UNREACH &&
1775             icmph->type != ICMP_REDIRECT &&
1776             icmph->type != ICMP_TIME_EXCEEDED &&
1777             icmph->type != ICMP_PARAMETERPROB)
1778                 return;
1779
1780         inner_iph = skb_header_pointer(skb,
1781                                        outer_iph->ihl * 4 + sizeof(_icmph),
1782                                        sizeof(_inner_iph), &_inner_iph);
1783         if (!inner_iph)
1784                 return;
1785         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1786         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1787 }
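/* Worked example (illustrative): a flow from A to B is forwarded over one
 * of two equal-cost paths.  A downstream router emits an ICMP time-exceeded
 * error whose outer header is <router -> A>; hashing those outer addresses
 * could steer the error onto the other path.  Hashing the quoted inner
 * header <A -> B> instead keeps the error on the same leg as the flow that
 * triggered it, so PMTU discovery and traceroute behave consistently.
 */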
1788
1789 /* If skb is set it will be used and fl4 can be NULL. */
1790 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1791                        const struct sk_buff *skb)
1792 {
1793         struct net *net = fi->fib_net;
1794         struct flow_keys hash_keys;
1795         u32 mhash;
1796
1797         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1798         case 0:
1799                 memset(&hash_keys, 0, sizeof(hash_keys));
1800                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1801                 if (skb) {
1802                         ip_multipath_l3_keys(skb, &hash_keys);
1803                 } else {
1804                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1805                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1806                 }
1807                 break;
1808         case 1:
1809                 /* skb is currently provided only when forwarding */
1810                 if (skb) {
1811                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1812                         struct flow_keys keys;
1813
1814                         /* short-circuit if we already have L4 hash present */
1815                         if (skb->l4_hash)
1816                                 return skb_get_hash_raw(skb) >> 1;
1817                         memset(&hash_keys, 0, sizeof(hash_keys));
1818                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1819                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1820                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1821                         hash_keys.ports.src = keys.ports.src;
1822                         hash_keys.ports.dst = keys.ports.dst;
1823                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1824                 } else {
1825                         memset(&hash_keys, 0, sizeof(hash_keys));
1826                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1827                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1828                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1829                         hash_keys.ports.src = fl4->fl4_sport;
1830                         hash_keys.ports.dst = fl4->fl4_dport;
1831                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1832                 }
1833                 break;
1834         }
1835         mhash = flow_hash_from_keys(&hash_keys);
1836
1837         return mhash >> 1;
1838 }
1839 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1840 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
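/* The hash policy above is per netns: policy 0 hashes the L3 addresses
 * only, policy 1 also mixes in the L4 ports and protocol.  It can be
 * switched at runtime, e.g.:
 *
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */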
1841
1842 static int ip_mkroute_input(struct sk_buff *skb,
1843                             struct fib_result *res,
1844                             struct in_device *in_dev,
1845                             __be32 daddr, __be32 saddr, u32 tos)
1846 {
1847 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1848         if (res->fi && res->fi->fib_nhs > 1) {
1849                 int h = fib_multipath_hash(res->fi, NULL, skb);
1850
1851                 fib_select_multipath(res, h);
1852         }
1853 #endif
1854
1855         /* create a routing cache entry */
1856         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1857 }
1858
1859 /*
1860  *      NOTE. We drop all packets that have a local source
1861  *      address, because every properly looped-back packet
1862  *      must already have the correct destination attached by the output routine.
1863  *
1864  *      This approach solves two big problems:
1865  *      1. Non-simplex devices are handled properly.
1866  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1867  *      called with rcu_read_lock()
1868  */
1869
1870 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1871                                u8 tos, struct net_device *dev)
1872 {
1873         struct fib_result res;
1874         struct in_device *in_dev = __in_dev_get_rcu(dev);
1875         struct ip_tunnel_info *tun_info;
1876         struct flowi4   fl4;
1877         unsigned int    flags = 0;
1878         u32             itag = 0;
1879         struct rtable   *rth;
1880         int             err = -EINVAL;
1881         struct net    *net = dev_net(dev);
1882         bool do_cache;
1883
1884         /* IP on this device is disabled. */
1885
1886         if (!in_dev)
1887                 goto out;
1888
1889         /* Check for the weirdest martians, which cannot be detected
1890            by fib_lookup.
1891          */
1892
1893         tun_info = skb_tunnel_info(skb);
1894         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1895                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1896         else
1897                 fl4.flowi4_tun_key.tun_id = 0;
1898         skb_dst_drop(skb);
1899
1900         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1901                 goto martian_source;
1902
1903         res.fi = NULL;
1904         res.table = NULL;
1905         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1906                 goto brd_input;
1907
1908         /* Accept zero addresses only for limited broadcast;
1909          * I do not even know whether to fix this or not. Waiting for complaints :-)
1910          */
1911         if (ipv4_is_zeronet(saddr))
1912                 goto martian_source;
1913
1914         if (ipv4_is_zeronet(daddr))
1915                 goto martian_destination;
1916
1917         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1918          * and calls it at most once when daddr and/or saddr is a loopback address
1919          */
1920         if (ipv4_is_loopback(daddr)) {
1921                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1922                         goto martian_destination;
1923         } else if (ipv4_is_loopback(saddr)) {
1924                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1925                         goto martian_source;
1926         }
1927
1928         /*
1929          *      Now we are ready to route the packet.
1930          */
1931         fl4.flowi4_oif = 0;
1932         fl4.flowi4_iif = dev->ifindex;
1933         fl4.flowi4_mark = skb->mark;
1934         fl4.flowi4_tos = tos;
1935         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1936         fl4.flowi4_flags = 0;
1937         fl4.daddr = daddr;
1938         fl4.saddr = saddr;
1939         fl4.flowi4_uid = sock_net_uid(net, NULL);
1940         err = fib_lookup(net, &fl4, &res, 0);
1941         if (err != 0) {
1942                 if (!IN_DEV_FORWARD(in_dev))
1943                         err = -EHOSTUNREACH;
1944                 goto no_route;
1945         }
1946
1947         if (res.type == RTN_BROADCAST)
1948                 goto brd_input;
1949
1950         if (res.type == RTN_LOCAL) {
1951                 err = fib_validate_source(skb, saddr, daddr, tos,
1952                                           0, dev, in_dev, &itag);
1953                 if (err < 0)
1954                         goto martian_source;
1955                 goto local_input;
1956         }
1957
1958         if (!IN_DEV_FORWARD(in_dev)) {
1959                 err = -EHOSTUNREACH;
1960                 goto no_route;
1961         }
1962         if (res.type != RTN_UNICAST)
1963                 goto martian_destination;
1964
1965         err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
1966 out:    return err;
1967
1968 brd_input:
1969         if (skb->protocol != htons(ETH_P_IP))
1970                 goto e_inval;
1971
1972         if (!ipv4_is_zeronet(saddr)) {
1973                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1974                                           in_dev, &itag);
1975                 if (err < 0)
1976                         goto martian_source;
1977         }
1978         flags |= RTCF_BROADCAST;
1979         res.type = RTN_BROADCAST;
1980         RT_CACHE_STAT_INC(in_brd);
1981
1982 local_input:
1983         do_cache = false;
1984         if (res.fi) {
1985                 if (!itag) {
1986                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1987                         if (rt_cache_valid(rth)) {
1988                                 skb_dst_set_noref(skb, &rth->dst);
1989                                 err = 0;
1990                                 goto out;
1991                         }
1992                         do_cache = true;
1993                 }
1994         }
1995
1996         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1997                            flags | RTCF_LOCAL, res.type,
1998                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1999         if (!rth)
2000                 goto e_nobufs;
2001
2002         rth->dst.output = ip_rt_bug;
2003 #ifdef CONFIG_IP_ROUTE_CLASSID
2004         rth->dst.tclassid = itag;
2005 #endif
2006         rth->rt_is_input = 1;
2007         if (res.table)
2008                 rth->rt_table_id = res.table->tb_id;
2009
2010         RT_CACHE_STAT_INC(in_slow_tot);
2011         if (res.type == RTN_UNREACHABLE) {
2012                 rth->dst.input = ip_error;
2013                 rth->dst.error = -err;
2014                 rth->rt_flags   &= ~RTCF_LOCAL;
2015         }
2016
2017         if (do_cache) {
2018                 struct fib_nh *nh = &FIB_RES_NH(res);
2019
2020                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2021                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2022                         WARN_ON(rth->dst.input == lwtunnel_input);
2023                         rth->dst.lwtstate->orig_input = rth->dst.input;
2024                         rth->dst.input = lwtunnel_input;
2025                 }
2026
2027                 if (unlikely(!rt_cache_route(nh, rth))) {
2028                         rth->dst.flags |= DST_NOCACHE;
2029                         rt_add_uncached_list(rth);
2030                 }
2031         }
2032         skb_dst_set(skb, &rth->dst);
2033         err = 0;
2034         goto out;
2035
2036 no_route:
2037         RT_CACHE_STAT_INC(in_no_route);
2038         res.type = RTN_UNREACHABLE;
2039         res.fi = NULL;
2040         res.table = NULL;
2041         goto local_input;
2042
2043         /*
2044          *      Do not cache martian addresses: they should be logged (RFC1812)
2045          */
2046 martian_destination:
2047         RT_CACHE_STAT_INC(in_martian_dst);
2048 #ifdef CONFIG_IP_ROUTE_VERBOSE
2049         if (IN_DEV_LOG_MARTIANS(in_dev))
2050                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2051                                      &daddr, &saddr, dev->name);
2052 #endif
2053
2054 e_inval:
2055         err = -EINVAL;
2056         goto out;
2057
2058 e_nobufs:
2059         err = -ENOBUFS;
2060         goto out;
2061
2062 martian_source:
2063         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2064         goto out;
2065 }
2066
2067 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2068                          u8 tos, struct net_device *dev)
2069 {
2070         int res;
2071
2072         tos &= IPTOS_RT_MASK;
2073         rcu_read_lock();
2074
2075         /* Multicast recognition logic is moved from the route cache to here.
2076            The problem was that too many Ethernet cards have broken/missing
2077            hardware multicast filters :-( As a result, a host on a multicast
2078            network acquires a lot of useless route cache entries, e.g. for
2079            SDR messages from all over the world. Now we try to get rid of them.
2080            Really, provided the software IP multicast filter is organized
2081            reasonably (at least, hashed), it does not result in a slowdown
2082            compared with route cache reject entries.
2083            Note that multicast routers are not affected, because a
2084            route cache entry is created eventually.
2085          */
2086         if (ipv4_is_multicast(daddr)) {
2087                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2088                 int our = 0;
2089
2090                 if (in_dev)
2091                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2092                                               ip_hdr(skb)->protocol);
2093
2094                 /* check l3 master if no match yet */
2095                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2096                         struct in_device *l3_in_dev;
2097
2098                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2099                         if (l3_in_dev)
2100                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2101                                                       ip_hdr(skb)->protocol);
2102                 }
2103
2104                 res = -EINVAL;
2105                 if (our
2106 #ifdef CONFIG_IP_MROUTE
2107                         ||
2108                     (!ipv4_is_local_multicast(daddr) &&
2109                      IN_DEV_MFORWARD(in_dev))
2110 #endif
2111                    ) {
2112                         res = ip_route_input_mc(skb, daddr, saddr,
2113                                                 tos, dev, our);
2114                 }
2115                 rcu_read_unlock();
2116                 return res;
2117         }
2118         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2119         rcu_read_unlock();
2120         return res;
2121 }
2122 EXPORT_SYMBOL(ip_route_input_noref);
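/* Illustrative sketch: the typical caller is the receive path, which
 * routes an skb before delivering or forwarding it, along the lines of
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * The dst is attached without taking a reference ("noref"), which is only
 * safe inside the RCU-protected receive path.
 */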
2123
2124 /* called with rcu_read_lock() */
2125 static struct rtable *__mkroute_output(const struct fib_result *res,
2126                                        const struct flowi4 *fl4, int orig_oif,
2127                                        struct net_device *dev_out,
2128                                        unsigned int flags)
2129 {
2130         struct fib_info *fi = res->fi;
2131         struct fib_nh_exception *fnhe;
2132         struct in_device *in_dev;
2133         u16 type = res->type;
2134         struct rtable *rth;
2135         bool do_cache;
2136
2137         in_dev = __in_dev_get_rcu(dev_out);
2138         if (!in_dev)
2139                 return ERR_PTR(-EINVAL);
2140
2141         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2142                 if (ipv4_is_loopback(fl4->saddr) &&
2143                     !(dev_out->flags & IFF_LOOPBACK) &&
2144                     !netif_is_l3_master(dev_out))
2145                         return ERR_PTR(-EINVAL);
2146
2147         if (ipv4_is_lbcast(fl4->daddr))
2148                 type = RTN_BROADCAST;
2149         else if (ipv4_is_multicast(fl4->daddr))
2150                 type = RTN_MULTICAST;
2151         else if (ipv4_is_zeronet(fl4->daddr))
2152                 return ERR_PTR(-EINVAL);
2153
2154         if (dev_out->flags & IFF_LOOPBACK)
2155                 flags |= RTCF_LOCAL;
2156
2157         do_cache = true;
2158         if (type == RTN_BROADCAST) {
2159                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2160                 fi = NULL;
2161         } else if (type == RTN_MULTICAST) {
2162                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2163                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2164                                      fl4->flowi4_proto))
2165                         flags &= ~RTCF_LOCAL;
2166                 else
2167                         do_cache = false;
2168                 /* If a multicast route does not exist, use the
2169                  * default one, but do not use a gateway in this case.
2170                  * Yes, it is a hack.
2171                  */
2172                 if (fi && res->prefixlen < 4)
2173                         fi = NULL;
2174         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2175                    (orig_oif != dev_out->ifindex)) {
2176                 /* For local routes that require a particular output interface
2177                  * we do not want to cache the result.  Caching the result
2178                  * causes incorrect behaviour when there are multiple source
2179                  * addresses on the interface, the end result being that if the
2180                  * intended recipient is waiting on that interface for the
2181                  * packet, they won't receive it because it will be delivered on
2182                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2183                  * be set to the loopback interface as well.
2184                  */
2185                 fi = NULL;
2186         }
2187
2188         fnhe = NULL;
2189         do_cache &= fi != NULL;
2190         if (do_cache) {
2191                 struct rtable __rcu **prth;
2192                 struct fib_nh *nh = &FIB_RES_NH(*res);
2193
2194                 fnhe = find_exception(nh, fl4->daddr);
2195                 if (fnhe) {
2196                         prth = &fnhe->fnhe_rth_output;
2197                         rth = rcu_dereference(*prth);
2198                         if (rth && rth->dst.expires &&
2199                             time_after(jiffies, rth->dst.expires)) {
2200                                 ip_del_fnhe(nh, fl4->daddr);
2201                                 fnhe = NULL;
2202                         } else {
2203                                 goto rt_cache;
2204                         }
2205                 }
2206
2207                 if (unlikely(fl4->flowi4_flags &
2208                              FLOWI_FLAG_KNOWN_NH &&
2209                              !(nh->nh_gw &&
2210                                nh->nh_scope == RT_SCOPE_LINK))) {
2211                         do_cache = false;
2212                         goto add;
2213                 }
2214                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2215                 rth = rcu_dereference(*prth);
2216
2217 rt_cache:
2218                 if (rt_cache_valid(rth)) {
2219                         dst_hold(&rth->dst);
2220                         return rth;
2221                 }
2222         }
2223
2224 add:
2225         rth = rt_dst_alloc(dev_out, flags, type,
2226                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2227                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2228                            do_cache);
2229         if (!rth)
2230                 return ERR_PTR(-ENOBUFS);
2231
2232         rth->rt_iif     = orig_oif ? : 0;
2233         if (res->table)
2234                 rth->rt_table_id = res->table->tb_id;
2235
2236         RT_CACHE_STAT_INC(out_slow_tot);
2237
2238         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2239                 if (flags & RTCF_LOCAL &&
2240                     !(dev_out->flags & IFF_LOOPBACK)) {
2241                         rth->dst.output = ip_mc_output;
2242                         RT_CACHE_STAT_INC(out_slow_mc);
2243                 }
2244 #ifdef CONFIG_IP_MROUTE
2245                 if (type == RTN_MULTICAST) {
2246                         if (IN_DEV_MFORWARD(in_dev) &&
2247                             !ipv4_is_local_multicast(fl4->daddr)) {
2248                                 rth->dst.input = ip_mr_input;
2249                                 rth->dst.output = ip_mc_output;
2250                         }
2251                 }
2252 #endif
2253         }
2254
2255         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2256         set_lwt_redirect(rth);
2257
2258         return rth;
2259 }
2260
2261 /*
2262  * Major route resolver routine.
2263  */
2264
2265 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2266                                           const struct sk_buff *skb)
2267 {
2268         struct net_device *dev_out = NULL;
2269         __u8 tos = RT_FL_TOS(fl4);
2270         unsigned int flags = 0;
2271         struct fib_result res;
2272         struct rtable *rth;
2273         int orig_oif;
2274         int err = -ENETUNREACH;
2275
2276         res.tclassid    = 0;
2277         res.fi          = NULL;
2278         res.table       = NULL;
2279
2280         orig_oif = fl4->flowi4_oif;
2281
2282         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2283         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2284         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2285                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2286
2287         rcu_read_lock();
2288         if (fl4->saddr) {
2289                 rth = ERR_PTR(-EINVAL);
2290                 if (ipv4_is_multicast(fl4->saddr) ||
2291                     ipv4_is_lbcast(fl4->saddr) ||
2292                     ipv4_is_zeronet(fl4->saddr))
2293                         goto out;
2294
2295                 /* I removed check for oif == dev_out->oif here.
2296                    It was wrong for two reasons:
2297                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2298                       is assigned to multiple interfaces.
2299                    2. Moreover, we are allowed to send packets with saddr
2300                       of another iface. --ANK
2301                  */
2302
2303                 if (fl4->flowi4_oif == 0 &&
2304                     (ipv4_is_multicast(fl4->daddr) ||
2305                      ipv4_is_lbcast(fl4->daddr))) {
2306                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2307                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2308                         if (!dev_out)
2309                                 goto out;
2310
2311                         /* Special hack: the user can direct multicasts
2312                            and limited broadcast via the necessary interface
2313                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2314                            This hack is not just for fun, it allows
2315                            vic, vat and friends to work.
2316                            They bind a socket to loopback, set the ttl to zero
2317                            and expect that it will work.
2318                            From the viewpoint of the routing cache they are broken,
2319                            because we are not allowed to build a multicast path
2320                            with a loopback source addr (look, the routing cache
2321                            cannot know that the ttl is zero, so the packet
2322                            will not leave this host and the route is valid).
2323                            Luckily, this hack is a good workaround.
2324                          */
2325
2326                         fl4->flowi4_oif = dev_out->ifindex;
2327                         goto make_route;
2328                 }
2329
2330                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2331                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2332                         if (!__ip_dev_find(net, fl4->saddr, false))
2333                                 goto out;
2334                 }
2335         }
2336
2337
2338         if (fl4->flowi4_oif) {
2339                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2340                 rth = ERR_PTR(-ENODEV);
2341                 if (!dev_out)
2342                         goto out;
2343
2344                 /* RACE: Check return value of inet_select_addr instead. */
2345                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2346                         rth = ERR_PTR(-ENETUNREACH);
2347                         goto out;
2348                 }
2349                 if (ipv4_is_local_multicast(fl4->daddr) ||
2350                     ipv4_is_lbcast(fl4->daddr) ||
2351                     fl4->flowi4_proto == IPPROTO_IGMP) {
2352                         if (!fl4->saddr)
2353                                 fl4->saddr = inet_select_addr(dev_out, 0,
2354                                                               RT_SCOPE_LINK);
2355                         goto make_route;
2356                 }
2357                 if (!fl4->saddr) {
2358                         if (ipv4_is_multicast(fl4->daddr))
2359                                 fl4->saddr = inet_select_addr(dev_out, 0,
2360                                                               fl4->flowi4_scope);
2361                         else if (!fl4->daddr)
2362                                 fl4->saddr = inet_select_addr(dev_out, 0,
2363                                                               RT_SCOPE_HOST);
2364                 }
2365         }
2366
2367         if (!fl4->daddr) {
2368                 fl4->daddr = fl4->saddr;
2369                 if (!fl4->daddr)
2370                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2371                 dev_out = net->loopback_dev;
2372                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2373                 res.type = RTN_LOCAL;
2374                 flags |= RTCF_LOCAL;
2375                 goto make_route;
2376         }
2377
2378         err = fib_lookup(net, fl4, &res, 0);
2379         if (err) {
2380                 res.fi = NULL;
2381                 res.table = NULL;
2382                 if (fl4->flowi4_oif &&
2383                     (ipv4_is_multicast(fl4->daddr) ||
2384                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2385                         /* Apparently, the routing tables are wrong. Assume
2386                            that the destination is on-link.
2387
2388                            WHY? DW.
2389                            Because we are allowed to send to an iface
2390                            even if it has NO routes and NO assigned
2391                            addresses. When oif is specified, the routing
2392                            tables are looked up with only one purpose:
2393                            to determine whether the destination is gatewayed,
2394                            rather than direct. Moreover, if MSG_DONTROUTE is set,
2395                            we send the packet, ignoring both the routing tables
2396                            and the ifaddr state. --ANK
2397
2398
2399                            We could do this even when oif is unknown
2400                            (as IPv6 likely does), but we do not.
2401                          */
2402
2403                         if (fl4->saddr == 0)
2404                                 fl4->saddr = inet_select_addr(dev_out, 0,
2405                                                               RT_SCOPE_LINK);
2406                         res.type = RTN_UNICAST;
2407                         goto make_route;
2408                 }
2409                 rth = ERR_PTR(err);
2410                 goto out;
2411         }
2412
2413         if (res.type == RTN_LOCAL) {
2414                 if (!fl4->saddr) {
2415                         if (res.fi->fib_prefsrc)
2416                                 fl4->saddr = res.fi->fib_prefsrc;
2417                         else
2418                                 fl4->saddr = fl4->daddr;
2419                 }
2420
2421                 /* L3 master device is the loopback for that domain */
2422                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(res)) ? :
2423                         net->loopback_dev;
2424                 fl4->flowi4_oif = dev_out->ifindex;
2425                 flags |= RTCF_LOCAL;
2426                 goto make_route;
2427         }
2428
2429         fib_select_path(net, &res, fl4, skb);
2430
2431         dev_out = FIB_RES_DEV(res);
2432         fl4->flowi4_oif = dev_out->ifindex;
2433
2434
2435 make_route:
2436         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2437
2438 out:
2439         rcu_read_unlock();
2440         return rth;
2441 }
2442 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2443
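/* Blackhole dst_ops: every method below is a deliberate no-op, so a
 * blackhole route never revalidates, never learns PMTU or redirects, and
 * never COWs metrics.  ipv4_blackhole_route() clones an existing rtable
 * onto these ops with input/output set to dst_discard*, giving the caller
 * a dst that silently drops traffic while preserving the flow information.
 */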
2444 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2445 {
2446         return NULL;
2447 }
2448
2449 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2450 {
2451         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2452
2453         return mtu ? : dst->dev->mtu;
2454 }
2455
2456 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2457                                           struct sk_buff *skb, u32 mtu)
2458 {
2459 }
2460
2461 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2462                                        struct sk_buff *skb)
2463 {
2464 }
2465
2466 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2467                                           unsigned long old)
2468 {
2469         return NULL;
2470 }
2471
2472 static struct dst_ops ipv4_dst_blackhole_ops = {
2473         .family                 =       AF_INET,
2474         .check                  =       ipv4_blackhole_dst_check,
2475         .mtu                    =       ipv4_blackhole_mtu,
2476         .default_advmss         =       ipv4_default_advmss,
2477         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2478         .redirect               =       ipv4_rt_blackhole_redirect,
2479         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2480         .neigh_lookup           =       ipv4_neigh_lookup,
2481 };
2482
2483 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2484 {
2485         struct rtable *ort = (struct rtable *) dst_orig;
2486         struct rtable *rt;
2487
2488         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2489         if (rt) {
2490                 struct dst_entry *new = &rt->dst;
2491
2492                 new->__use = 1;
2493                 new->input = dst_discard;
2494                 new->output = dst_discard_out;
2495
2496                 new->dev = ort->dst.dev;
2497                 if (new->dev)
2498                         dev_hold(new->dev);
2499
2500                 rt->rt_is_input = ort->rt_is_input;
2501                 rt->rt_iif = ort->rt_iif;
2502                 rt->rt_pmtu = ort->rt_pmtu;
2503
2504                 rt->rt_genid = rt_genid_ipv4(net);
2505                 rt->rt_flags = ort->rt_flags;
2506                 rt->rt_type = ort->rt_type;
2507                 rt->rt_gateway = ort->rt_gateway;
2508                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2509
2510                 INIT_LIST_HEAD(&rt->rt_uncached);
2511                 dst_free(new);
2512         }
2513
2514         dst_release(dst_orig);
2515
2516         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2517 }
2518
2519 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2520                                     const struct sock *sk)
2521 {
2522         struct rtable *rt = __ip_route_output_key(net, flp4);
2523
2524         if (IS_ERR(rt))
2525                 return rt;
2526
2527         if (flp4->flowi4_proto)
2528                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2529                                                         flowi4_to_flowi(flp4),
2530                                                         sk, 0);
2531
2532         return rt;
2533 }
2534 EXPORT_SYMBOL_GPL(ip_route_output_flow);
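/* Illustrative sketch of an output lookup: fill a flowi4, check the
 * result with IS_ERR(), and drop the reference when done.
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */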
2535
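/* Fill a netlink RTM_NEWROUTE message describing @rt for the given flow:
 * table, addresses, oif/iif, gateway, metrics, mark and uid attributes as
 * applicable, plus the cache info (expiry, error) at the end.
 */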
2536 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2537                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2538                         u32 seq, int event)
2539 {
2540         struct rtable *rt = skb_rtable(skb);
2541         struct rtmsg *r;
2542         struct nlmsghdr *nlh;
2543         unsigned long expires = 0;
2544         u32 error;
2545         u32 metrics[RTAX_MAX];
2546
2547         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
2548         if (!nlh)
2549                 return -EMSGSIZE;
2550
2551         r = nlmsg_data(nlh);
2552         r->rtm_family    = AF_INET;
2553         r->rtm_dst_len  = 32;
2554         r->rtm_src_len  = 0;
2555         r->rtm_tos      = fl4->flowi4_tos;
2556         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2557         if (nla_put_u32(skb, RTA_TABLE, table_id))
2558                 goto nla_put_failure;
2559         r->rtm_type     = rt->rt_type;
2560         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2561         r->rtm_protocol = RTPROT_UNSPEC;
2562         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2563         if (rt->rt_flags & RTCF_NOTIFY)
2564                 r->rtm_flags |= RTM_F_NOTIFY;
2565         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2566                 r->rtm_flags |= RTCF_DOREDIRECT;
2567
2568         if (nla_put_in_addr(skb, RTA_DST, dst))
2569                 goto nla_put_failure;
2570         if (src) {
2571                 r->rtm_src_len = 32;
2572                 if (nla_put_in_addr(skb, RTA_SRC, src))
2573                         goto nla_put_failure;
2574         }
2575         if (rt->dst.dev &&
2576             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2577                 goto nla_put_failure;
2578 #ifdef CONFIG_IP_ROUTE_CLASSID
2579         if (rt->dst.tclassid &&
2580             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2581                 goto nla_put_failure;
2582 #endif
2583         if (!rt_is_input_route(rt) &&
2584             fl4->saddr != src) {
2585                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2586                         goto nla_put_failure;
2587         }
2588         if (rt->rt_uses_gateway &&
2589             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2590                 goto nla_put_failure;
2591
2592         expires = rt->dst.expires;
2593         if (expires) {
2594                 unsigned long now = jiffies;
2595
2596                 if (time_before(now, expires))
2597                         expires -= now;
2598                 else
2599                         expires = 0;
2600         }
2601
2602         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2603         if (rt->rt_pmtu && expires)
2604                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2605         if (rtnetlink_put_metrics(skb, metrics) < 0)
2606                 goto nla_put_failure;
2607
2608         if (fl4->flowi4_mark &&
2609             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2610                 goto nla_put_failure;
2611
2612         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2613             nla_put_u32(skb, RTA_UID,
2614                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2615                 goto nla_put_failure;
2616
2617         error = rt->dst.error;
2618
2619         if (rt_is_input_route(rt)) {
2620 #ifdef CONFIG_IP_MROUTE
2621                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2622                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2623                         int err = ipmr_get_route(net, skb,
2624                                                  fl4->saddr, fl4->daddr,
2625                                                  r, portid);
2626
2627                         if (err <= 0) {
2628                                 if (err == 0)
2629                                         return 0;
2630                                 goto nla_put_failure;
2631                         }
2632                 } else
2633 #endif
2634                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2635                                 goto nla_put_failure;
2636         }
2637
2638         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2639                 goto nla_put_failure;
2640
2641         nlmsg_end(skb, nlh);
2642         return 0;
2643
2644 nla_put_failure:
2645         nlmsg_cancel(skb, nlh);
2646         return -EMSGSIZE;
2647 }
2648
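/* RTM_GETROUTE handler: this is what answers "ip route get".  It builds a
 * dummy skb around the requested addresses, resolves it through the input
 * or output path depending on whether RTA_IIF was supplied, and returns
 * the result to the caller via rt_fill_info().
 */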
2649 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2650                              struct netlink_ext_ack *extack)
2651 {
2652         struct net *net = sock_net(in_skb->sk);
2653         struct rtmsg *rtm;
2654         struct nlattr *tb[RTA_MAX+1];
2655         struct rtable *rt = NULL;
2656         struct flowi4 fl4;
2657         __be32 dst = 0;
2658         __be32 src = 0;
2659         u32 iif;
2660         int err;
2661         int mark;
2662         struct sk_buff *skb;
2663         u32 table_id = RT_TABLE_MAIN;
2664         kuid_t uid;
2665
2666         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2667                           extack);
2668         if (err < 0)
2669                 goto errout;
2670
2671         rtm = nlmsg_data(nlh);
2672
2673         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2674         if (!skb) {
2675                 err = -ENOBUFS;
2676                 goto errout;
2677         }
2678
2679         /* Reserve room for dummy headers; this skb can pass
2680            through a good chunk of the routing engine.
2681          */
2682         skb_reset_mac_header(skb);
2683         skb_reset_network_header(skb);
2684
2685         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2686         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2687         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2688         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2689         if (tb[RTA_UID])
2690                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2691         else
2692                 uid = (iif ? INVALID_UID : current_uid());
2693
2694         /* Bugfix: need to give ip_route_input enough of an IP header to
2695          * not gag.
2696          */
2697         ip_hdr(skb)->protocol = IPPROTO_UDP;
2698         ip_hdr(skb)->saddr = src;
2699         ip_hdr(skb)->daddr = dst;
2700
2701         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2702
2703         memset(&fl4, 0, sizeof(fl4));
2704         fl4.daddr = dst;
2705         fl4.saddr = src;
2706         fl4.flowi4_tos = rtm->rtm_tos;
2707         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2708         fl4.flowi4_mark = mark;
2709         fl4.flowi4_uid = uid;
2710
2711         if (iif) {
2712                 struct net_device *dev;
2713
2714                 dev = __dev_get_by_index(net, iif);
2715                 if (!dev) {
2716                         err = -ENODEV;
2717                         goto errout_free;
2718                 }
2719
2720                 skb->protocol   = htons(ETH_P_IP);
2721                 skb->dev        = dev;
2722                 skb->mark       = mark;
2723                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2724
2725                 rt = skb_rtable(skb);
2726                 if (err == 0 && rt->dst.error)
2727                         err = -rt->dst.error;
2728         } else {
2729                 rt = ip_route_output_key(net, &fl4);
2730
2731                 err = 0;
2732                 if (IS_ERR(rt))
2733                         err = PTR_ERR(rt);
2734         }
2735
2736         if (err)
2737                 goto errout_free;
2738
2739         skb_dst_set(skb, &rt->dst);
2740         if (rtm->rtm_flags & RTM_F_NOTIFY)
2741                 rt->rt_flags |= RTCF_NOTIFY;
2742
2743         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2744                 table_id = rt->rt_table_id;
2745
2746         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2747                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2748                            RTM_NEWROUTE);
2749         if (err < 0)
2750                 goto errout_free;
2751
2752         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2753 errout:
2754         return err;
2755
2756 errout_free:
2757         kfree_skb(skb);
2758         goto errout;
2759 }
2760
2761 void ip_rt_multicast_event(struct in_device *in_dev)
2762 {
2763         rt_cache_flush(dev_net(in_dev->dev));
2764 }
2765
2766 #ifdef CONFIG_SYSCTL
2767 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2768 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2769 static int ip_rt_gc_elasticity __read_mostly    = 8;
2770
2771 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2772                                         void __user *buffer,
2773                                         size_t *lenp, loff_t *ppos)
2774 {
2775         struct net *net = (struct net *)__ctl->extra1;
2776
2777         if (write) {
2778                 rt_cache_flush(net);
2779                 fnhe_genid_bump(net);
2780                 return 0;
2781         }
2782
2783         return -EINVAL;
2784 }
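/* The handler above backs the write-only "flush" sysctl registered below;
 * writing any value invalidates all cached routes and nexthop exceptions
 * in the netns, e.g.:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */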
2785
2786 static struct ctl_table ipv4_route_table[] = {
2787         {
2788                 .procname       = "gc_thresh",
2789                 .data           = &ipv4_dst_ops.gc_thresh,
2790                 .maxlen         = sizeof(int),
2791                 .mode           = 0644,
2792                 .proc_handler   = proc_dointvec,
2793         },
2794         {
2795                 .procname       = "max_size",
2796                 .data           = &ip_rt_max_size,
2797                 .maxlen         = sizeof(int),
2798                 .mode           = 0644,
2799                 .proc_handler   = proc_dointvec,
2800         },
2801         {
2802                 /*  Deprecated. Use gc_min_interval_ms */
2803
2804                 .procname       = "gc_min_interval",
2805                 .data           = &ip_rt_gc_min_interval,
2806                 .maxlen         = sizeof(int),
2807                 .mode           = 0644,
2808                 .proc_handler   = proc_dointvec_jiffies,
2809         },
2810         {
2811                 .procname       = "gc_min_interval_ms",
2812                 .data           = &ip_rt_gc_min_interval,
2813                 .maxlen         = sizeof(int),
2814                 .mode           = 0644,
2815                 .proc_handler   = proc_dointvec_ms_jiffies,
2816         },
2817         {
2818                 .procname       = "gc_timeout",
2819                 .data           = &ip_rt_gc_timeout,
2820                 .maxlen         = sizeof(int),
2821                 .mode           = 0644,
2822                 .proc_handler   = proc_dointvec_jiffies,
2823         },
2824         {
2825                 .procname       = "gc_interval",
2826                 .data           = &ip_rt_gc_interval,
2827                 .maxlen         = sizeof(int),
2828                 .mode           = 0644,
2829                 .proc_handler   = proc_dointvec_jiffies,
2830         },
2831         {
2832                 .procname       = "redirect_load",
2833                 .data           = &ip_rt_redirect_load,
2834                 .maxlen         = sizeof(int),
2835                 .mode           = 0644,
2836                 .proc_handler   = proc_dointvec,
2837         },
2838         {
2839                 .procname       = "redirect_number",
2840                 .data           = &ip_rt_redirect_number,
2841                 .maxlen         = sizeof(int),
2842                 .mode           = 0644,
2843                 .proc_handler   = proc_dointvec,
2844         },
2845         {
2846                 .procname       = "redirect_silence",
2847                 .data           = &ip_rt_redirect_silence,
2848                 .maxlen         = sizeof(int),
2849                 .mode           = 0644,
2850                 .proc_handler   = proc_dointvec,
2851         },
2852         {
2853                 .procname       = "error_cost",
2854                 .data           = &ip_rt_error_cost,
2855                 .maxlen         = sizeof(int),
2856                 .mode           = 0644,
2857                 .proc_handler   = proc_dointvec,
2858         },
2859         {
2860                 .procname       = "error_burst",
2861                 .data           = &ip_rt_error_burst,
2862                 .maxlen         = sizeof(int),
2863                 .mode           = 0644,
2864                 .proc_handler   = proc_dointvec,
2865         },
2866         {
2867                 .procname       = "gc_elasticity",
2868                 .data           = &ip_rt_gc_elasticity,
2869                 .maxlen         = sizeof(int),
2870                 .mode           = 0644,
2871                 .proc_handler   = proc_dointvec,
2872         },
2873         {
2874                 .procname       = "mtu_expires",
2875                 .data           = &ip_rt_mtu_expires,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = proc_dointvec_jiffies,
2879         },
2880         {
2881                 .procname       = "min_pmtu",
2882                 .data           = &ip_rt_min_pmtu,
2883                 .maxlen         = sizeof(int),
2884                 .mode           = 0644,
2885                 .proc_handler   = proc_dointvec,
2886         },
2887         {
2888                 .procname       = "min_adv_mss",
2889                 .data           = &ip_rt_min_advmss,
2890                 .maxlen         = sizeof(int),
2891                 .mode           = 0644,
2892                 .proc_handler   = proc_dointvec,
2893         },
2894         { }
2895 };
2896
2897 static struct ctl_table ipv4_route_flush_table[] = {
2898         {
2899                 .procname       = "flush",
2900                 .maxlen         = sizeof(int),
2901                 .mode           = 0200,
2902                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2903         },
2904         { },
2905 };
2906
2907 static __net_init int sysctl_route_net_init(struct net *net)
2908 {
2909         struct ctl_table *tbl;
2910
2911         tbl = ipv4_route_flush_table;
2912         if (!net_eq(net, &init_net)) {
2913                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2914                 if (!tbl)
2915                         goto err_dup;
2916
2917                 /* Don't export sysctls to unprivileged users */
2918                 if (net->user_ns != &init_user_ns)
2919                         tbl[0].procname = NULL;
2920         }
2921         tbl[0].extra1 = net;
2922
2923         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2924         if (!net->ipv4.route_hdr)
2925                 goto err_reg;
2926         return 0;
2927
2928 err_reg:
2929         if (tbl != ipv4_route_flush_table)
2930                 kfree(tbl);
2931 err_dup:
2932         return -ENOMEM;
2933 }
2934
2935 static __net_exit void sysctl_route_net_exit(struct net *net)
2936 {
2937         struct ctl_table *tbl;
2938
2939         tbl = net->ipv4.route_hdr->ctl_table_arg;
2940         unregister_net_sysctl_table(net->ipv4.route_hdr);
2941         BUG_ON(tbl == ipv4_route_flush_table);
2942         kfree(tbl);
2943 }
2944
2945 static __net_initdata struct pernet_operations sysctl_route_ops = {
2946         .init = sysctl_route_net_init,
2947         .exit = sysctl_route_net_exit,
2948 };
2949 #endif
2950
2951 static __net_init int rt_genid_init(struct net *net)
2952 {
2953         atomic_set(&net->ipv4.rt_genid, 0);
2954         atomic_set(&net->fnhe_genid, 0);
2955         get_random_bytes(&net->ipv4.dev_addr_genid,
2956                          sizeof(net->ipv4.dev_addr_genid));
2957         return 0;
2958 }
2959
2960 static __net_initdata struct pernet_operations rt_genid_ops = {
2961         .init = rt_genid_init,
2962 };
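
rt_genid_init() above zeroes the generation counters behind O(1) route invalidation: each cached rtable records the genid current when it was created, so a single atomic increment expires every older entry without walking them. A simplified sketch of both sides (demo_* names are mine; compare rt_is_expired() and rt_cache_flush() elsewhere in this file):

	/* Sketch: generation-count invalidation. */
	static bool demo_rt_is_expired(const struct rtable *rt,
				       struct net *net)
	{
		/* Stale the moment the namespace counter moves on. */
		return rt->rt_genid != atomic_read(&net->ipv4.rt_genid);
	}

	static void demo_flush(struct net *net)
	{
		/* The whole "flush": one increment, no list walking. */
		atomic_inc(&net->ipv4.rt_genid);
	}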
2963
2964 static int __net_init ipv4_inetpeer_init(struct net *net)
2965 {
2966         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2967
2968         if (!bp)
2969                 return -ENOMEM;
2970         inet_peer_base_init(bp);
2971         net->ipv4.peers = bp;
2972         return 0;
2973 }
2974
2975 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2976 {
2977         struct inet_peer_base *bp = net->ipv4.peers;
2978
2979         net->ipv4.peers = NULL;
2980         inetpeer_invalidate_tree(bp);
2981         kfree(bp);
2982 }
2983
2984 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2985         .init   =       ipv4_inetpeer_init,
2986         .exit   =       ipv4_inetpeer_exit,
2987 };
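
The base allocated by ipv4_inetpeer_init() backs long-lived per-destination state such as ICMP rate limiting. A usage sketch, hedged because inet_getpeer_v4()'s parameter list has shifted across kernel versions; the VRF-index argument shown is an assumption about this era, and demo_ is a made-up name:

	/* Sketch: look up (creating on miss) per-destination peer state
	 * in this namespace's base, then drop the reference. */
	#include <net/inetpeer.h>

	static void demo_peer_touch(struct net *net, __be32 daddr)
	{
		struct inet_peer *peer;

		peer = inet_getpeer_v4(net->ipv4.peers, daddr,
				       0 /* vif */, 1 /* create */);
		if (peer)
			inet_putpeer(peer);
	}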
2988
2989 #ifdef CONFIG_IP_ROUTE_CLASSID
2990 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2991 #endif /* CONFIG_IP_ROUTE_CLASSID */
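
Under CONFIG_IP_ROUTE_CLASSID, ip_rt_acct holds 256 counters per CPU, indexed by a route's tclassid, and /proc/net/rt_acct reports their sums. A sketch of that aggregation, in the spirit of the proc dumper (demo_sum_rt_acct is a made-up name; fields are from struct ip_rt_acct):

	/* Sketch: fold the per-cpu classid counters into one array. */
	static void demo_sum_rt_acct(struct ip_rt_acct *dst)
	{
		int cpu;

		memset(dst, 0, 256 * sizeof(*dst));
		for_each_possible_cpu(cpu) {
			struct ip_rt_acct *src = per_cpu_ptr(ip_rt_acct, cpu);
			unsigned int i;

			for (i = 0; i < 256; i++) {
				dst[i].o_bytes   += src[i].o_bytes;
				dst[i].o_packets += src[i].o_packets;
				dst[i].i_bytes   += src[i].i_bytes;
				dst[i].i_packets += src[i].i_packets;
			}
		}
	}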
2992
2993 int __init ip_rt_init(void)
2994 {
2995         int rc = 0;
2996         int cpu;
2997
2998         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2999         if (!ip_idents)
3000                 panic("IP: failed to allocate ip_idents\n");
3001
3002         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3003
3004         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3005         if (!ip_tstamps)
3006                 panic("IP: failed to allocate ip_tstamps\n");
3007
3008         for_each_possible_cpu(cpu) {
3009                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3010
3011                 INIT_LIST_HEAD(&ul->head);
3012                 spin_lock_init(&ul->lock);
3013         }
3014 #ifdef CONFIG_IP_ROUTE_CLASSID
3015         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3016         if (!ip_rt_acct)
3017                 panic("IP: failed to allocate ip_rt_acct\n");
3018 #endif
3019
3020         ipv4_dst_ops.kmem_cachep =
3021                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3022                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3023
3024         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3025
3026         if (dst_entries_init(&ipv4_dst_ops) < 0)
3027                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3028
3029         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3030                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3031
3032         ipv4_dst_ops.gc_thresh = ~0;
3033         ip_rt_max_size = INT_MAX;
3034
3035         devinet_init();
3036         ip_fib_init();
3037
3038         if (ip_rt_proc_init())
3039                 pr_err("Unable to create route proc files\n");
3040 #ifdef CONFIG_XFRM
3041         xfrm_init();
3042         xfrm4_init();
3043 #endif
3044         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3045
3046 #ifdef CONFIG_SYSCTL
3047         register_pernet_subsys(&sysctl_route_ops);
3048 #endif
3049         register_pernet_subsys(&rt_genid_ops);
3050         register_pernet_subsys(&ipv4_inetpeer_ops);
3051         return rc;
3052 }
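
Among the allocations above, ip_idents and ip_tstamps feed IP ID generation: the flow hash selects one of IP_IDENTS_SZ buckets, a bucket that has sat idle gets a random forward jump so IDs stay hard to predict, and an atomic add reserves the next range. A simplified reconstruction, not the verbatim kernel code (demo_idents_reserve is a made-up name):

	/* Sketch of the consumer of ip_idents/ip_tstamps allocated in
	 * ip_rt_init() above; compare ip_idents_reserve() in this file. */
	static u32 demo_idents_reserve(u32 hash, int segs)
	{
		atomic_t *id     = ip_idents  + hash % IP_IDENTS_SZ;
		u32      *tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
		u32 old = READ_ONCE(*tstamp);
		u32 now = (u32)jiffies;
		u32 delta = 0;

		/* Bucket idle since 'old': random jump decorrelates flows. */
		if (old != now && cmpxchg(tstamp, old, now) == old)
			delta = prandom_u32_max(now - old);

		/* Reserve 'segs' IDs; return the first of the range. */
		return atomic_add_return(segs + delta, id) - segs;
	}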
3053
3054 #ifdef CONFIG_SYSCTL
3055 /*
3056  * We really need to sanitize the damn ipv4 init order, then all
3057  * this nonsense will go away.
3058  */
3059 void __init ip_static_sysctl_init(void)
3060 {
3061         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3062 }
3063 #endif
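
ip_static_sysctl_init() registers the main table for init_net early in boot, before the pernet machinery above has run. A runnable userspace sketch that lists what this file ends up registering (assumes procfs is mounted at /proc):

	/* Userspace sketch: enumerate the route sysctl directory. */
	#include <stdio.h>
	#include <dirent.h>

	int main(void)
	{
		DIR *d = opendir("/proc/sys/net/ipv4/route");
		struct dirent *e;

		if (!d) {
			perror("opendir");
			return 1;
		}
		while ((e = readdir(d)))
			if (e->d_name[0] != '.')
				puts(e->d_name);
		closedir(d);
		return 0;
	}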