ipv4: Make neigh lookups directly in output packet path.
net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
116 #define RT_FL_TOS(oldflp4) \
117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define IP_MAX_MTU      0xFFF0
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly  = 9;
128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly       = HZ;
131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly    = 8;
133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly       = 256;
136 static int rt_chain_length_max __read_mostly    = 20;
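/* Note: these defaults are expressed in jiffies or bytes and are typically
 * exposed as sysctls (net.ipv4.route.*).  For example, ip_rt_gc_timeout =
 * RT_GC_TIMEOUT = 300*HZ lets an unused cache entry expire after roughly
 * five minutes, and ip_rt_min_pmtu = 512 + 20 + 20 = 552 bytes allows for
 * 512 bytes of payload plus 20-byte IP and TCP headers.
 */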
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155                             int how)
156 {
157 }
158
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161         struct rtable *rt = (struct rtable *) dst;
162         struct inet_peer *peer;
163         u32 *p = NULL;
164
165         peer = rt_get_peer_create(rt, rt->rt_dst);
166         if (peer) {
167                 u32 *old_p = __DST_METRICS_PTR(old);
168                 unsigned long prev, new;
169
170                 p = peer->metrics;
171                 if (inet_metrics_new(peer))
172                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
173
174                 new = (unsigned long) p;
175                 prev = cmpxchg(&dst->_metrics, old, new);
176
177                 if (prev != old) {
178                         p = __DST_METRICS_PTR(prev);
179                         if (prev & DST_METRICS_READ_ONLY)
180                                 p = NULL;
181                 } else {
182                         if (rt->fi) {
183                                 fib_info_put(rt->fi);
184                                 rt->fi = NULL;
185                         }
186                 }
187         }
188         return p;
189 }
190
191 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
192
193 static struct dst_ops ipv4_dst_ops = {
194         .family =               AF_INET,
195         .protocol =             cpu_to_be16(ETH_P_IP),
196         .gc =                   rt_garbage_collect,
197         .check =                ipv4_dst_check,
198         .default_advmss =       ipv4_default_advmss,
199         .mtu =                  ipv4_mtu,
200         .cow_metrics =          ipv4_cow_metrics,
201         .destroy =              ipv4_dst_destroy,
202         .ifdown =               ipv4_dst_ifdown,
203         .negative_advice =      ipv4_negative_advice,
204         .link_failure =         ipv4_link_failure,
205         .update_pmtu =          ip_rt_update_pmtu,
206         .local_out =            __ip_local_out,
207         .neigh_lookup =         ipv4_neigh_lookup,
208 };
209
210 #define ECN_OR_COST(class)      TC_PRIO_##class
211
212 const __u8 ip_tos2prio[16] = {
213         TC_PRIO_BESTEFFORT,
214         ECN_OR_COST(BESTEFFORT),
215         TC_PRIO_BESTEFFORT,
216         ECN_OR_COST(BESTEFFORT),
217         TC_PRIO_BULK,
218         ECN_OR_COST(BULK),
219         TC_PRIO_BULK,
220         ECN_OR_COST(BULK),
221         TC_PRIO_INTERACTIVE,
222         ECN_OR_COST(INTERACTIVE),
223         TC_PRIO_INTERACTIVE,
224         ECN_OR_COST(INTERACTIVE),
225         TC_PRIO_INTERACTIVE_BULK,
226         ECN_OR_COST(INTERACTIVE_BULK),
227         TC_PRIO_INTERACTIVE_BULK,
228         ECN_OR_COST(INTERACTIVE_BULK)
229 };
230 EXPORT_SYMBOL(ip_tos2prio);
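/* Example: the packet scheduler priority is derived as
 * ip_tos2prio[IPTOS_TOS(tos) >> 1] (see rt_tos2priority() in
 * <net/route.h>), so the four TOS bits index this 16-entry table; a TOS
 * of IPTOS_LOWDELAY (0x10) gives index 8, i.e. TC_PRIO_INTERACTIVE.
 */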
231
232 /*
233  * Route cache.
234  */
235
236 /* The locking scheme is rather straightforward:
237  *
238  * 1) Read-Copy Update protects the buckets of the central route hash.
239  * 2) Only writers remove entries, and they hold the lock
240  *    as they look at rtable reference counts.
241  * 3) Only readers acquire references to rtable entries,
242  *    they do so with atomic increments and with the
243  *    lock held.
244  */
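/* Concretely: readers walk a bucket under rcu_read_lock_bh() using
 * rcu_dereference_bh() and bump the entry's refcount atomically, while
 * writers take the per-bucket spinlock (rt_hash_lock_addr(slot)), publish
 * updates with rcu_assign_pointer(), and free unlinked entries via
 * rt_free(), i.e. only after an RCU grace period.
 */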
245
246 struct rt_hash_bucket {
247         struct rtable __rcu     *chain;
248 };
249
250 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
251         defined(CONFIG_PROVE_LOCKING)
252 /*
253  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
254  * The size of this table is a power of two and depends on the number of CPUS.
255  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
256  */
257 #ifdef CONFIG_LOCKDEP
258 # define RT_HASH_LOCK_SZ        256
259 #else
260 # if NR_CPUS >= 32
261 #  define RT_HASH_LOCK_SZ       4096
262 # elif NR_CPUS >= 16
263 #  define RT_HASH_LOCK_SZ       2048
264 # elif NR_CPUS >= 8
265 #  define RT_HASH_LOCK_SZ       1024
266 # elif NR_CPUS >= 4
267 #  define RT_HASH_LOCK_SZ       512
268 # else
269 #  define RT_HASH_LOCK_SZ       256
270 # endif
271 #endif
272
273 static spinlock_t       *rt_hash_locks;
274 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
275
276 static __init void rt_hash_lock_init(void)
277 {
278         int i;
279
280         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
281                         GFP_KERNEL);
282         if (!rt_hash_locks)
283                 panic("IP: failed to allocate rt_hash_locks\n");
284
285         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
286                 spin_lock_init(&rt_hash_locks[i]);
287 }
288 #else
289 # define rt_hash_lock_addr(slot) NULL
290
291 static inline void rt_hash_lock_init(void)
292 {
293 }
294 #endif
295
296 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
297 static unsigned int             rt_hash_mask __read_mostly;
298 static unsigned int             rt_hash_log  __read_mostly;
299
300 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
301 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
302
303 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
304                                    int genid)
305 {
306         return jhash_3words((__force u32)daddr, (__force u32)saddr,
307                             idx, genid)
308                 & rt_hash_mask;
309 }
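/* Example (illustrative): both insertion and lookup of a cached route use
 * the same tuple, e.g. hash = rt_hash(daddr, saddr, oif_or_iif,
 * rt_genid(net)).  Since the generation id is mixed into the hash,
 * bumping it in rt_cache_invalidate() makes old entries unreachable for
 * new lookups; any stale entry that still collides is filtered out by
 * rt_is_expired().
 */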
310
311 static inline int rt_genid(struct net *net)
312 {
313         return atomic_read(&net->ipv4.rt_genid);
314 }
315
316 #ifdef CONFIG_PROC_FS
317 struct rt_cache_iter_state {
318         struct seq_net_private p;
319         int bucket;
320         int genid;
321 };
322
323 static struct rtable *rt_cache_get_first(struct seq_file *seq)
324 {
325         struct rt_cache_iter_state *st = seq->private;
326         struct rtable *r = NULL;
327
328         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
329                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
330                         continue;
331                 rcu_read_lock_bh();
332                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
333                 while (r) {
334                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
335                             r->rt_genid == st->genid)
336                                 return r;
337                         r = rcu_dereference_bh(r->dst.rt_next);
338                 }
339                 rcu_read_unlock_bh();
340         }
341         return r;
342 }
343
344 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
345                                           struct rtable *r)
346 {
347         struct rt_cache_iter_state *st = seq->private;
348
349         r = rcu_dereference_bh(r->dst.rt_next);
350         while (!r) {
351                 rcu_read_unlock_bh();
352                 do {
353                         if (--st->bucket < 0)
354                                 return NULL;
355                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
356                 rcu_read_lock_bh();
357                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
358         }
359         return r;
360 }
361
362 static struct rtable *rt_cache_get_next(struct seq_file *seq,
363                                         struct rtable *r)
364 {
365         struct rt_cache_iter_state *st = seq->private;
366         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
367                 if (dev_net(r->dst.dev) != seq_file_net(seq))
368                         continue;
369                 if (r->rt_genid == st->genid)
370                         break;
371         }
372         return r;
373 }
374
375 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
376 {
377         struct rtable *r = rt_cache_get_first(seq);
378
379         if (r)
380                 while (pos && (r = rt_cache_get_next(seq, r)))
381                         --pos;
382         return pos ? NULL : r;
383 }
384
385 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
386 {
387         struct rt_cache_iter_state *st = seq->private;
388         if (*pos)
389                 return rt_cache_get_idx(seq, *pos - 1);
390         st->genid = rt_genid(seq_file_net(seq));
391         return SEQ_START_TOKEN;
392 }
393
394 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
395 {
396         struct rtable *r;
397
398         if (v == SEQ_START_TOKEN)
399                 r = rt_cache_get_first(seq);
400         else
401                 r = rt_cache_get_next(seq, v);
402         ++*pos;
403         return r;
404 }
405
406 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
407 {
408         if (v && v != SEQ_START_TOKEN)
409                 rcu_read_unlock_bh();
410 }
411
412 static int rt_cache_seq_show(struct seq_file *seq, void *v)
413 {
414         if (v == SEQ_START_TOKEN)
415                 seq_printf(seq, "%-127s\n",
416                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
417                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
418                            "HHUptod\tSpecDst");
419         else {
420                 struct rtable *r = v;
421                 struct neighbour *n;
422                 int len, HHUptod;
423
424                 rcu_read_lock();
425                 n = dst_get_neighbour_noref(&r->dst);
426                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
427                 rcu_read_unlock();
428
429                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
430                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
431                         r->dst.dev ? r->dst.dev->name : "*",
432                         (__force u32)r->rt_dst,
433                         (__force u32)r->rt_gateway,
434                         r->rt_flags, atomic_read(&r->dst.__refcnt),
435                         r->dst.__use, 0, (__force u32)r->rt_src,
436                         dst_metric_advmss(&r->dst) + 40,
437                         dst_metric(&r->dst, RTAX_WINDOW),
438                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
439                               dst_metric(&r->dst, RTAX_RTTVAR)),
440                         r->rt_key_tos,
441                         -1,
442                         HHUptod,
443                         0, &len);
444
445                 seq_printf(seq, "%*s\n", 127 - len, "");
446         }
447         return 0;
448 }
449
450 static const struct seq_operations rt_cache_seq_ops = {
451         .start  = rt_cache_seq_start,
452         .next   = rt_cache_seq_next,
453         .stop   = rt_cache_seq_stop,
454         .show   = rt_cache_seq_show,
455 };
456
457 static int rt_cache_seq_open(struct inode *inode, struct file *file)
458 {
459         return seq_open_net(inode, file, &rt_cache_seq_ops,
460                         sizeof(struct rt_cache_iter_state));
461 }
462
463 static const struct file_operations rt_cache_seq_fops = {
464         .owner   = THIS_MODULE,
465         .open    = rt_cache_seq_open,
466         .read    = seq_read,
467         .llseek  = seq_lseek,
468         .release = seq_release_net,
469 };
470
471
472 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
473 {
474         int cpu;
475
476         if (*pos == 0)
477                 return SEQ_START_TOKEN;
478
479         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
480                 if (!cpu_possible(cpu))
481                         continue;
482                 *pos = cpu+1;
483                 return &per_cpu(rt_cache_stat, cpu);
484         }
485         return NULL;
486 }
487
488 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
489 {
490         int cpu;
491
492         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
493                 if (!cpu_possible(cpu))
494                         continue;
495                 *pos = cpu+1;
496                 return &per_cpu(rt_cache_stat, cpu);
497         }
498         return NULL;
499
500 }
501
502 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
503 {
504
505 }
506
507 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
508 {
509         struct rt_cache_stat *st = v;
510
511         if (v == SEQ_START_TOKEN) {
512                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
513                 return 0;
514         }
515
516         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
517                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
518                    dst_entries_get_slow(&ipv4_dst_ops),
519                    st->in_hit,
520                    st->in_slow_tot,
521                    st->in_slow_mc,
522                    st->in_no_route,
523                    st->in_brd,
524                    st->in_martian_dst,
525                    st->in_martian_src,
526
527                    st->out_hit,
528                    st->out_slow_tot,
529                    st->out_slow_mc,
530
531                    st->gc_total,
532                    st->gc_ignored,
533                    st->gc_goal_miss,
534                    st->gc_dst_overflow,
535                    st->in_hlist_search,
536                    st->out_hlist_search
537                 );
538         return 0;
539 }
540
541 static const struct seq_operations rt_cpu_seq_ops = {
542         .start  = rt_cpu_seq_start,
543         .next   = rt_cpu_seq_next,
544         .stop   = rt_cpu_seq_stop,
545         .show   = rt_cpu_seq_show,
546 };
547
548
549 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
550 {
551         return seq_open(file, &rt_cpu_seq_ops);
552 }
553
554 static const struct file_operations rt_cpu_seq_fops = {
555         .owner   = THIS_MODULE,
556         .open    = rt_cpu_seq_open,
557         .read    = seq_read,
558         .llseek  = seq_lseek,
559         .release = seq_release,
560 };
561
562 #ifdef CONFIG_IP_ROUTE_CLASSID
563 static int rt_acct_proc_show(struct seq_file *m, void *v)
564 {
565         struct ip_rt_acct *dst, *src;
566         unsigned int i, j;
567
568         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
569         if (!dst)
570                 return -ENOMEM;
571
572         for_each_possible_cpu(i) {
573                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
574                 for (j = 0; j < 256; j++) {
575                         dst[j].o_bytes   += src[j].o_bytes;
576                         dst[j].o_packets += src[j].o_packets;
577                         dst[j].i_bytes   += src[j].i_bytes;
578                         dst[j].i_packets += src[j].i_packets;
579                 }
580         }
581
582         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
583         kfree(dst);
584         return 0;
585 }
586
587 static int rt_acct_proc_open(struct inode *inode, struct file *file)
588 {
589         return single_open(file, rt_acct_proc_show, NULL);
590 }
591
592 static const struct file_operations rt_acct_proc_fops = {
593         .owner          = THIS_MODULE,
594         .open           = rt_acct_proc_open,
595         .read           = seq_read,
596         .llseek         = seq_lseek,
597         .release        = single_release,
598 };
599 #endif
600
601 static int __net_init ip_rt_do_proc_init(struct net *net)
602 {
603         struct proc_dir_entry *pde;
604
605         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
606                         &rt_cache_seq_fops);
607         if (!pde)
608                 goto err1;
609
610         pde = proc_create("rt_cache", S_IRUGO,
611                           net->proc_net_stat, &rt_cpu_seq_fops);
612         if (!pde)
613                 goto err2;
614
615 #ifdef CONFIG_IP_ROUTE_CLASSID
616         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
617         if (!pde)
618                 goto err3;
619 #endif
620         return 0;
621
622 #ifdef CONFIG_IP_ROUTE_CLASSID
623 err3:
624         remove_proc_entry("rt_cache", net->proc_net_stat);
625 #endif
626 err2:
627         remove_proc_entry("rt_cache", net->proc_net);
628 err1:
629         return -ENOMEM;
630 }
631
632 static void __net_exit ip_rt_do_proc_exit(struct net *net)
633 {
634         remove_proc_entry("rt_cache", net->proc_net_stat);
635         remove_proc_entry("rt_cache", net->proc_net);
636 #ifdef CONFIG_IP_ROUTE_CLASSID
637         remove_proc_entry("rt_acct", net->proc_net);
638 #endif
639 }
640
641 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
642         .init = ip_rt_do_proc_init,
643         .exit = ip_rt_do_proc_exit,
644 };
645
646 static int __init ip_rt_proc_init(void)
647 {
648         return register_pernet_subsys(&ip_rt_proc_ops);
649 }
650
651 #else
652 static inline int ip_rt_proc_init(void)
653 {
654         return 0;
655 }
656 #endif /* CONFIG_PROC_FS */
657
658 static inline void rt_free(struct rtable *rt)
659 {
660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
661 }
662
663 static inline void rt_drop(struct rtable *rt)
664 {
665         ip_rt_put(rt);
666         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
667 }
668
669 static inline int rt_fast_clean(struct rtable *rth)
670 {
671         /* Kill broadcast/multicast entries very aggressively, if they
672            collide in the hash table with more useful entries */
673         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
674                 rt_is_input_route(rth) && rth->dst.rt_next;
675 }
676
677 static inline int rt_valuable(struct rtable *rth)
678 {
679         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
680                 (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
681 }
682
683 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
684 {
685         unsigned long age;
686         int ret = 0;
687
688         if (atomic_read(&rth->dst.__refcnt))
689                 goto out;
690
691         age = jiffies - rth->dst.lastuse;
692         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
693             (age <= tmo2 && rt_valuable(rth)))
694                 goto out;
695         ret = 1;
696 out:    return ret;
697 }
698
699 /* Bits of score are:
700  * 31: very valuable
701  * 30: not quite useless
702  * 29..0: usage counter
703  */
704 static inline u32 rt_score(struct rtable *rt)
705 {
706         u32 score = jiffies - rt->dst.lastuse;
707
708         score = ~score & ~(3<<30);
709
710         if (rt_valuable(rt))
711                 score |= (1<<31);
712
713         if (rt_is_output_route(rt) ||
714             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
715                 score |= (1<<30);
716
717         return score;
718 }
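/* Note: rt_intern_hash() uses this score to choose an eviction candidate;
 * among unreferenced entries in a chain, the one with the lowest score is
 * freed once the chain grows past ip_rt_gc_elasticity.  So an old
 * broadcast input route (bits 31 and 30 clear, low usage counter) loses
 * to a recently used unicast output route.
 */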
719
720 static inline bool rt_caching(const struct net *net)
721 {
722         return net->ipv4.current_rt_cache_rebuild_count <=
723                 net->ipv4.sysctl_rt_cache_rebuild_count;
724 }
725
726 static inline bool compare_hash_inputs(const struct rtable *rt1,
727                                        const struct rtable *rt2)
728 {
729         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
731                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
732 }
733
734 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
735 {
736         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
737                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
738                 (rt1->rt_mark ^ rt2->rt_mark) |
739                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
740                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
741                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
742 }
743
744 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
745 {
746         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
747 }
748
749 static inline int rt_is_expired(struct rtable *rth)
750 {
751         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
752 }
753
754 /*
755  * Perform a full scan of the hash table and free all entries.
756  * Can be called by a softirq or a process.
757  * In the latter case, we want to reschedule if necessary.
758  */
759 static void rt_do_flush(struct net *net, int process_context)
760 {
761         unsigned int i;
762         struct rtable *rth, *next;
763
764         for (i = 0; i <= rt_hash_mask; i++) {
765                 struct rtable __rcu **pprev;
766                 struct rtable *list;
767
768                 if (process_context && need_resched())
769                         cond_resched();
770                 rth = rcu_access_pointer(rt_hash_table[i].chain);
771                 if (!rth)
772                         continue;
773
774                 spin_lock_bh(rt_hash_lock_addr(i));
775
776                 list = NULL;
777                 pprev = &rt_hash_table[i].chain;
778                 rth = rcu_dereference_protected(*pprev,
779                         lockdep_is_held(rt_hash_lock_addr(i)));
780
781                 while (rth) {
782                         next = rcu_dereference_protected(rth->dst.rt_next,
783                                 lockdep_is_held(rt_hash_lock_addr(i)));
784
785                         if (!net ||
786                             net_eq(dev_net(rth->dst.dev), net)) {
787                                 rcu_assign_pointer(*pprev, next);
788                                 rcu_assign_pointer(rth->dst.rt_next, list);
789                                 list = rth;
790                         } else {
791                                 pprev = &rth->dst.rt_next;
792                         }
793                         rth = next;
794                 }
795
796                 spin_unlock_bh(rt_hash_lock_addr(i));
797
798                 for (; list; list = next) {
799                         next = rcu_dereference_protected(list->dst.rt_next, 1);
800                         rt_free(list);
801                 }
802         }
803 }
804
805 /*
806  * While freeing expired entries, we compute average chain length
807  * and standard deviation, using fixed-point arithmetic.
808  * This gives an estimate of rt_chain_length_max:
809  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
810  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
811  */
812
813 #define FRACT_BITS 3
814 #define ONE (1UL << FRACT_BITS)
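/* Example: with FRACT_BITS = 3, lengths are accumulated in units of
 * ONE = 8 (has_noalias() returns 0 or ONE per entry), so a fixed-point
 * average of 20 means 2.5 distinct entries per chain; the final
 * ">> FRACT_BITS" in rt_check_expire() converts back to whole entries.
 */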
815
816 /*
817  * Given a hash chain and an item in this hash chain,
818  * find if a previous entry has the same hash_inputs
819  * (but differs on tos, mark or oif)
820  * Returns 0 if an alias is found.
821  * Returns ONE if rth has no alias before itself.
822  */
823 static int has_noalias(const struct rtable *head, const struct rtable *rth)
824 {
825         const struct rtable *aux = head;
826
827         while (aux != rth) {
828                 if (compare_hash_inputs(aux, rth))
829                         return 0;
830                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
831         }
832         return ONE;
833 }
834
835 static void rt_check_expire(void)
836 {
837         static unsigned int rover;
838         unsigned int i = rover, goal;
839         struct rtable *rth;
840         struct rtable __rcu **rthp;
841         unsigned long samples = 0;
842         unsigned long sum = 0, sum2 = 0;
843         unsigned long delta;
844         u64 mult;
845
846         delta = jiffies - expires_ljiffies;
847         expires_ljiffies = jiffies;
848         mult = ((u64)delta) << rt_hash_log;
849         if (ip_rt_gc_timeout > 1)
850                 do_div(mult, ip_rt_gc_timeout);
851         goal = (unsigned int)mult;
852         if (goal > rt_hash_mask)
853                 goal = rt_hash_mask + 1;
854         for (; goal > 0; goal--) {
855                 unsigned long tmo = ip_rt_gc_timeout;
856                 unsigned long length;
857
858                 i = (i + 1) & rt_hash_mask;
859                 rthp = &rt_hash_table[i].chain;
860
861                 if (need_resched())
862                         cond_resched();
863
864                 samples++;
865
866                 if (rcu_dereference_raw(*rthp) == NULL)
867                         continue;
868                 length = 0;
869                 spin_lock_bh(rt_hash_lock_addr(i));
870                 while ((rth = rcu_dereference_protected(*rthp,
871                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
872                         prefetch(rth->dst.rt_next);
873                         if (rt_is_expired(rth) ||
874                             rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
875                                 *rthp = rth->dst.rt_next;
876                                 rt_free(rth);
877                                 continue;
878                         }
879
880                         /* We only count entries on a chain with equal
881                          * hash inputs once so that entries for
882                          * different QOS levels, and other non-hash
883                          * input attributes don't unfairly skew the
884                          * length computation
885                          */
886                         tmo >>= 1;
887                         rthp = &rth->dst.rt_next;
888                         length += has_noalias(rt_hash_table[i].chain, rth);
889                 }
890                 spin_unlock_bh(rt_hash_lock_addr(i));
891                 sum += length;
892                 sum2 += length*length;
893         }
894         if (samples) {
895                 unsigned long avg = sum / samples;
896                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
897                 rt_chain_length_max = max_t(unsigned long,
898                                         ip_rt_gc_elasticity,
899                                         (avg + 4*sd) >> FRACT_BITS);
900         }
901         rover = i;
902 }
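/* Note: "goal" above scales with the time since the previous run
 * ((delta << rt_hash_log) / ip_rt_gc_timeout buckets), so the periodic
 * worker sweeps the whole table roughly once per ip_rt_gc_timeout, and
 * rt_chain_length_max is refreshed from the measured chain statistics as
 * max(ip_rt_gc_elasticity, avg + 4*sd).
 */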
903
904 /*
905  * rt_worker_func() is run in process context.
906  * We call rt_check_expire() to scan part of the hash table.
907  */
908 static void rt_worker_func(struct work_struct *work)
909 {
910         rt_check_expire();
911         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
912 }
913
914 /*
915  * Perturbation of rt_genid by a small quantity [1..256]
916  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
917  * many times (2^24) without repeating a recent rt_genid.
918  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
919  */
920 static void rt_cache_invalidate(struct net *net)
921 {
922         unsigned char shuffle;
923
924         get_random_bytes(&shuffle, sizeof(shuffle));
925         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
926         inetpeer_invalidate_family(AF_INET);
927 }
928
929 /*
930  * delay < 0  : invalidate cache (fast : entries will be deleted later)
931  * delay >= 0 : invalidate & flush cache (can be long)
932  */
933 void rt_cache_flush(struct net *net, int delay)
934 {
935         rt_cache_invalidate(net);
936         if (delay >= 0)
937                 rt_do_flush(net, !in_softirq());
938 }
939
940 /* Flush previous cache invalidated entries from the cache */
941 void rt_cache_flush_batch(struct net *net)
942 {
943         rt_do_flush(net, !in_softirq());
944 }
945
946 static void rt_emergency_hash_rebuild(struct net *net)
947 {
948         net_warn_ratelimited("Route hash chain too long!\n");
949         rt_cache_invalidate(net);
950 }
951
952 /*
953    Short description of GC goals.
954
955    We want to build an algorithm which keeps the routing cache
956    at an equilibrium point, where the number of aged-off entries
957    stays approximately equal to the number of newly generated ones.
958
959    The current expiration strength is the variable "expire".
960    We try to adjust it dynamically, so that when the network
961    is idle, expire is large enough to keep plenty of warm entries,
962    and when load increases it shrinks to limit the cache size.
963  */
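/* Worked example: rt_garbage_collect() below starts from
 *      goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 * so with the default elasticity of 8, up to about eight entries per hash
 * bucket are tolerated; beyond that we are in the "dangerous area" and the
 * excess is expired aggressively, while "expire" itself is halved whenever
 * a pass fails to reach the goal and grown again once it succeeds.
 */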
964
965 static int rt_garbage_collect(struct dst_ops *ops)
966 {
967         static unsigned long expire = RT_GC_TIMEOUT;
968         static unsigned long last_gc;
969         static int rover;
970         static int equilibrium;
971         struct rtable *rth;
972         struct rtable __rcu **rthp;
973         unsigned long now = jiffies;
974         int goal;
975         int entries = dst_entries_get_fast(&ipv4_dst_ops);
976
977         /*
978          * Garbage collection is pretty expensive,
979          * do not make it too frequently.
980          */
981
982         RT_CACHE_STAT_INC(gc_total);
983
984         if (now - last_gc < ip_rt_gc_min_interval &&
985             entries < ip_rt_max_size) {
986                 RT_CACHE_STAT_INC(gc_ignored);
987                 goto out;
988         }
989
990         entries = dst_entries_get_slow(&ipv4_dst_ops);
991         /* Calculate number of entries, which we want to expire now. */
992         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
993         if (goal <= 0) {
994                 if (equilibrium < ipv4_dst_ops.gc_thresh)
995                         equilibrium = ipv4_dst_ops.gc_thresh;
996                 goal = entries - equilibrium;
997                 if (goal > 0) {
998                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
999                         goal = entries - equilibrium;
1000                 }
1001         } else {
1002                 /* We are in dangerous area. Try to reduce cache really
1003                  * aggressively.
1004                  */
1005                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1006                 equilibrium = entries - goal;
1007         }
1008
1009         if (now - last_gc >= ip_rt_gc_min_interval)
1010                 last_gc = now;
1011
1012         if (goal <= 0) {
1013                 equilibrium += goal;
1014                 goto work_done;
1015         }
1016
1017         do {
1018                 int i, k;
1019
1020                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1021                         unsigned long tmo = expire;
1022
1023                         k = (k + 1) & rt_hash_mask;
1024                         rthp = &rt_hash_table[k].chain;
1025                         spin_lock_bh(rt_hash_lock_addr(k));
1026                         while ((rth = rcu_dereference_protected(*rthp,
1027                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1028                                 if (!rt_is_expired(rth) &&
1029                                         !rt_may_expire(rth, tmo, expire)) {
1030                                         tmo >>= 1;
1031                                         rthp = &rth->dst.rt_next;
1032                                         continue;
1033                                 }
1034                                 *rthp = rth->dst.rt_next;
1035                                 rt_free(rth);
1036                                 goal--;
1037                         }
1038                         spin_unlock_bh(rt_hash_lock_addr(k));
1039                         if (goal <= 0)
1040                                 break;
1041                 }
1042                 rover = k;
1043
1044                 if (goal <= 0)
1045                         goto work_done;
1046
1047                 /* Goal is not achieved. We stop the process if:
1048
1049                    - expire has been reduced to zero; otherwise, expire is halved.
1050                    - the table is not full.
1051                    - we are called from interrupt context.
1052                    - the jiffies check is just a fallback/debug loop breaker.
1053                      We will not spin here for a long time in any case.
1054                  */
1055
1056                 RT_CACHE_STAT_INC(gc_goal_miss);
1057
1058                 if (expire == 0)
1059                         break;
1060
1061                 expire >>= 1;
1062
1063                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1064                         goto out;
1065         } while (!in_softirq() && time_before_eq(jiffies, now));
1066
1067         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1068                 goto out;
1069         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1070                 goto out;
1071         net_warn_ratelimited("dst cache overflow\n");
1072         RT_CACHE_STAT_INC(gc_dst_overflow);
1073         return 1;
1074
1075 work_done:
1076         expire += ip_rt_gc_min_interval;
1077         if (expire > ip_rt_gc_timeout ||
1078             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1079             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1080                 expire = ip_rt_gc_timeout;
1081 out:    return 0;
1082 }
1083
1084 /*
1085  * Returns number of entries in a hash chain that have different hash_inputs
1086  */
1087 static int slow_chain_length(const struct rtable *head)
1088 {
1089         int length = 0;
1090         const struct rtable *rth = head;
1091
1092         while (rth) {
1093                 length += has_noalias(head, rth);
1094                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1095         }
1096         return length >> FRACT_BITS;
1097 }
1098
1099 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1100 {
1101         struct net_device *dev = dst->dev;
1102         const __be32 *pkey = daddr;
1103         const struct rtable *rt;
1104         struct neighbour *n;
1105
1106         rt = (const struct rtable *) dst;
1107         if (rt->rt_gateway)
1108                 pkey = (const __be32 *) &rt->rt_gateway;
1109
1110         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1111         if (n)
1112                 return n;
1113         return neigh_create(&arp_tbl, pkey, dev);
1114 }
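/* Note: for a gatewayed route the neighbour is keyed by rt_gateway (the
 * next hop) rather than the final destination; __ipv4_neigh_lookup()
 * searches the ARP table (arp_tbl) and neigh_create() allocates a new
 * neighbour entry on a miss.
 */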
1115
1116 static int rt_bind_neighbour(struct rtable *rt)
1117 {
1118         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1119         if (IS_ERR(n))
1120                 return PTR_ERR(n);
1121         dst_set_neighbour(&rt->dst, n);
1122
1123         return 0;
1124 }
1125
1126 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1127                                      struct sk_buff *skb, int ifindex)
1128 {
1129         struct rtable   *rth, *cand;
1130         struct rtable __rcu **rthp, **candp;
1131         unsigned long   now;
1132         u32             min_score;
1133         int             chain_length;
1134         int attempts = !in_softirq();
1135
1136 restart:
1137         chain_length = 0;
1138         min_score = ~(u32)0;
1139         cand = NULL;
1140         candp = NULL;
1141         now = jiffies;
1142
1143         if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1144                 /*
1145                  * If we're not caching, just tell the caller we
1146                  * were successful and don't touch the route.  The
1147                  * caller holds the sole reference to the cache entry, and
1148                  * it will be released when the caller is done with it.
1149                  * If we drop it here, the callers have no way to resolve routes
1150                  * when we're not caching.  Instead, just point *rp at rt, so
1151                  * the caller gets a single use out of the route
1152                  * Note that we do rt_free on this new route entry, so that
1153                  * once its refcount hits zero, we are still able to reap it
1154                  * (Thanks Alexey)
1155                  * Note: To avoid expensive rcu stuff for this uncached dst,
1156                  * we set DST_NOCACHE so that dst_release() can free dst without
1157                  * waiting a grace period.
1158                  */
1159
1160                 rt->dst.flags |= DST_NOCACHE;
1161                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1162                         int err = rt_bind_neighbour(rt);
1163                         if (err) {
1164                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1165                                 ip_rt_put(rt);
1166                                 return ERR_PTR(err);
1167                         }
1168                 }
1169
1170                 goto skip_hashing;
1171         }
1172
1173         rthp = &rt_hash_table[hash].chain;
1174
1175         spin_lock_bh(rt_hash_lock_addr(hash));
1176         while ((rth = rcu_dereference_protected(*rthp,
1177                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1178                 if (rt_is_expired(rth)) {
1179                         *rthp = rth->dst.rt_next;
1180                         rt_free(rth);
1181                         continue;
1182                 }
1183                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1184                         /* Put it first */
1185                         *rthp = rth->dst.rt_next;
1186                         /*
1187                          * Since lookup is lockfree, the deletion
1188                          * must be visible to another weakly ordered CPU before
1189                          * the insertion at the start of the hash chain.
1190                          */
1191                         rcu_assign_pointer(rth->dst.rt_next,
1192                                            rt_hash_table[hash].chain);
1193                         /*
1194                          * Since lookup is lockfree, the update writes
1195                          * must be ordered for consistency on SMP.
1196                          */
1197                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1198
1199                         dst_use(&rth->dst, now);
1200                         spin_unlock_bh(rt_hash_lock_addr(hash));
1201
1202                         rt_drop(rt);
1203                         if (skb)
1204                                 skb_dst_set(skb, &rth->dst);
1205                         return rth;
1206                 }
1207
1208                 if (!atomic_read(&rth->dst.__refcnt)) {
1209                         u32 score = rt_score(rth);
1210
1211                         if (score <= min_score) {
1212                                 cand = rth;
1213                                 candp = rthp;
1214                                 min_score = score;
1215                         }
1216                 }
1217
1218                 chain_length++;
1219
1220                 rthp = &rth->dst.rt_next;
1221         }
1222
1223         if (cand) {
1224                 /* ip_rt_gc_elasticity used to be the average chain
1225                  * length; when exceeded, gc becomes really aggressive.
1226                  *
1227                  * The second limit is less certain. At the moment it allows
1228                  * only 2 entries per bucket. We will see.
1229                  */
1230                 if (chain_length > ip_rt_gc_elasticity) {
1231                         *candp = cand->dst.rt_next;
1232                         rt_free(cand);
1233                 }
1234         } else {
1235                 if (chain_length > rt_chain_length_max &&
1236                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1237                         struct net *net = dev_net(rt->dst.dev);
1238                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1239                         if (!rt_caching(net)) {
1240                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1241                                         rt->dst.dev->name, num);
1242                         }
1243                         rt_emergency_hash_rebuild(net);
1244                         spin_unlock_bh(rt_hash_lock_addr(hash));
1245
1246                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1247                                         ifindex, rt_genid(net));
1248                         goto restart;
1249                 }
1250         }
1251
1252         /* Try to bind route to arp only if it is output
1253            route or unicast forwarding path.
1254          */
1255         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1256                 int err = rt_bind_neighbour(rt);
1257                 if (err) {
1258                         spin_unlock_bh(rt_hash_lock_addr(hash));
1259
1260                         if (err != -ENOBUFS) {
1261                                 rt_drop(rt);
1262                                 return ERR_PTR(err);
1263                         }
1264
1265                         /* Neighbour tables are full and nothing
1266                            can be released. Try to shrink the route cache,
1267                            as it most likely holds some neighbour records.
1268                          */
1269                         if (attempts-- > 0) {
1270                                 int saved_elasticity = ip_rt_gc_elasticity;
1271                                 int saved_int = ip_rt_gc_min_interval;
1272                                 ip_rt_gc_elasticity     = 1;
1273                                 ip_rt_gc_min_interval   = 0;
1274                                 rt_garbage_collect(&ipv4_dst_ops);
1275                                 ip_rt_gc_min_interval   = saved_int;
1276                                 ip_rt_gc_elasticity     = saved_elasticity;
1277                                 goto restart;
1278                         }
1279
1280                         net_warn_ratelimited("Neighbour table overflow\n");
1281                         rt_drop(rt);
1282                         return ERR_PTR(-ENOBUFS);
1283                 }
1284         }
1285
1286         rt->dst.rt_next = rt_hash_table[hash].chain;
1287
1288         /*
1289          * Since lookup is lockfree, we must make sure
1290          * previous writes to rt are committed to memory
1291          * before making rt visible to other CPUS.
1292          */
1293         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1294
1295         spin_unlock_bh(rt_hash_lock_addr(hash));
1296
1297 skip_hashing:
1298         if (skb)
1299                 skb_dst_set(skb, &rt->dst);
1300         return rt;
1301 }
1302
1303 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1304
1305 static u32 rt_peer_genid(void)
1306 {
1307         return atomic_read(&__rt_peer_genid);
1308 }
1309
1310 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1311 {
1312         struct inet_peer_base *base;
1313         struct inet_peer *peer;
1314
1315         base = inetpeer_base_ptr(rt->_peer);
1316         if (!base)
1317                 return;
1318
1319         peer = inet_getpeer_v4(base, daddr, create);
1320         if (peer) {
1321                 if (!rt_set_peer(rt, peer))
1322                         inet_putpeer(peer);
1323                 else
1324                         rt->rt_peer_genid = rt_peer_genid();
1325         }
1326 }
1327
1328 /*
1329  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1330  * we can still generate some output.
1331  * Random ID selection looks a bit dangerous because we have no way to
1332  * select an ID that is unique within a reasonable period of time.
1333  * But a broken packet identifier may be better than no packet at all.
1334  */
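/* Put differently: __ip_select_ident() below normally takes the IP ID from
 * the per-destination inet_peer counter via inet_getid(), and only falls
 * back to the single hashed counter in ip_select_fb_ident() when no peer
 * is available or the route is marked DST_NOPEER.
 */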
1335 static void ip_select_fb_ident(struct iphdr *iph)
1336 {
1337         static DEFINE_SPINLOCK(ip_fb_id_lock);
1338         static u32 ip_fallback_id;
1339         u32 salt;
1340
1341         spin_lock_bh(&ip_fb_id_lock);
1342         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1343         iph->id = htons(salt & 0xFFFF);
1344         ip_fallback_id = salt;
1345         spin_unlock_bh(&ip_fb_id_lock);
1346 }
1347
1348 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1349 {
1350         struct rtable *rt = (struct rtable *) dst;
1351
1352         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1353                 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1354
1355                 /* If peer is attached to destination, it is never detached,
1356                    so we need not grab a lock to dereference it.
1357                  */
1358                 if (peer) {
1359                         iph->id = htons(inet_getid(peer, more));
1360                         return;
1361                 }
1362         } else if (!rt)
1363                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1364
1365         ip_select_fb_ident(iph);
1366 }
1367 EXPORT_SYMBOL(__ip_select_ident);
1368
1369 static void rt_del(unsigned int hash, struct rtable *rt)
1370 {
1371         struct rtable __rcu **rthp;
1372         struct rtable *aux;
1373
1374         rthp = &rt_hash_table[hash].chain;
1375         spin_lock_bh(rt_hash_lock_addr(hash));
1376         ip_rt_put(rt);
1377         while ((aux = rcu_dereference_protected(*rthp,
1378                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1379                 if (aux == rt || rt_is_expired(aux)) {
1380                         *rthp = aux->dst.rt_next;
1381                         rt_free(aux);
1382                         continue;
1383                 }
1384                 rthp = &aux->dst.rt_next;
1385         }
1386         spin_unlock_bh(rt_hash_lock_addr(hash));
1387 }
1388
1389 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1390 {
1391         struct rtable *rt = (struct rtable *) dst;
1392         __be32 orig_gw = rt->rt_gateway;
1393         struct neighbour *n, *old_n;
1394
1395         dst_confirm(&rt->dst);
1396
1397         rt->rt_gateway = peer->redirect_learned.a4;
1398
1399         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1400         if (IS_ERR(n)) {
1401                 rt->rt_gateway = orig_gw;
1402                 return;
1403         }
1404         old_n = xchg(&rt->dst._neighbour, n);
1405         if (old_n)
1406                 neigh_release(old_n);
1407         if (!(n->nud_state & NUD_VALID)) {
1408                 neigh_event_send(n, NULL);
1409         } else {
1410                 rt->rt_flags |= RTCF_REDIRECTED;
1411                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1412         }
1413 }
1414
1415 /* called in rcu_read_lock() section */
1416 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1417                     __be32 saddr, struct net_device *dev)
1418 {
1419         int s, i;
1420         struct in_device *in_dev = __in_dev_get_rcu(dev);
1421         __be32 skeys[2] = { saddr, 0 };
1422         int    ikeys[2] = { dev->ifindex, 0 };
1423         struct inet_peer *peer;
1424         struct net *net;
1425
1426         if (!in_dev)
1427                 return;
1428
1429         net = dev_net(dev);
1430         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1431             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1432             ipv4_is_zeronet(new_gw))
1433                 goto reject_redirect;
1434
1435         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1436                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1437                         goto reject_redirect;
1438                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1439                         goto reject_redirect;
1440         } else {
1441                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1442                         goto reject_redirect;
1443         }
1444
1445         for (s = 0; s < 2; s++) {
1446                 for (i = 0; i < 2; i++) {
1447                         unsigned int hash;
1448                         struct rtable __rcu **rthp;
1449                         struct rtable *rt;
1450
1451                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1452
1453                         rthp = &rt_hash_table[hash].chain;
1454
1455                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1456                                 rthp = &rt->dst.rt_next;
1457
1458                                 if (rt->rt_key_dst != daddr ||
1459                                     rt->rt_key_src != skeys[s] ||
1460                                     rt->rt_oif != ikeys[i] ||
1461                                     rt_is_input_route(rt) ||
1462                                     rt_is_expired(rt) ||
1463                                     !net_eq(dev_net(rt->dst.dev), net) ||
1464                                     rt->dst.error ||
1465                                     rt->dst.dev != dev ||
1466                                     rt->rt_gateway != old_gw)
1467                                         continue;
1468
1469                                 peer = rt_get_peer_create(rt, rt->rt_dst);
1470                                 if (peer) {
1471                                         if (peer->redirect_learned.a4 != new_gw) {
1472                                                 peer->redirect_learned.a4 = new_gw;
1473                                                 atomic_inc(&__rt_peer_genid);
1474                                         }
1475                                         check_peer_redir(&rt->dst, peer);
1476                                 }
1477                         }
1478                 }
1479         }
1480         return;
1481
1482 reject_redirect:
1483 #ifdef CONFIG_IP_ROUTE_VERBOSE
1484         if (IN_DEV_LOG_MARTIANS(in_dev))
1485                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1486                                      "  Advised path = %pI4 -> %pI4\n",
1487                                      &old_gw, dev->name, &new_gw,
1488                                      &saddr, &daddr);
1489 #endif
1490         ;
1491 }
1492
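/* Both helpers below use cmpxchg() so that only one caller can "claim" a
 * pending PMTU expiry: peer_pmtu_expired() claims it only once the deadline
 * has actually passed, while peer_pmtu_cleaned() claims any pending expiry
 * unconditionally.  The caller that wins the race restores pmtu_orig.
 */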
1493 static bool peer_pmtu_expired(struct inet_peer *peer)
1494 {
1495         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1496
1497         return orig &&
1498                time_after_eq(jiffies, orig) &&
1499                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1500 }
1501
1502 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1503 {
1504         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1505
1506         return orig &&
1507                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1508 }
1509
1510 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1511 {
1512         struct rtable *rt = (struct rtable *)dst;
1513         struct dst_entry *ret = dst;
1514
1515         if (rt) {
1516                 if (dst->obsolete > 0) {
1517                         ip_rt_put(rt);
1518                         ret = NULL;
1519                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1520                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1521                                                 rt->rt_oif,
1522                                                 rt_genid(dev_net(dst->dev)));
1523                         rt_del(hash, rt);
1524                         ret = NULL;
1525                 } else if (rt_has_peer(rt)) {
1526                         struct inet_peer *peer = rt_peer_ptr(rt);
1527                         if (peer_pmtu_expired(peer))
1528                                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1529                 }
1530         }
1531         return ret;
1532 }
1533
1534 /*
1535  * Algorithm:
1536  *      1. The first ip_rt_redirect_number redirects are sent
1537  *         with exponential backoff, then we stop sending them at all,
1538  *         assuming that the host ignores our redirects.
1539  *      2. If we did not see packets requiring redirects
1540  *         during ip_rt_redirect_silence, we assume that the host
1541  *         forgot the redirected route, and we start sending redirects again.
1542  *
1543  * This algorithm is much cheaper and more intelligent than dumb load limiting
1544  * in icmp.c.
1545  *
1546  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1547  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1548  */
1549
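/* A worked example of the backoff above, assuming the default sysctls defined
 * earlier in this file (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9):
 * the first redirect is sent immediately; each following one is sent only once
 * rate_last + (ip_rt_redirect_load << rate_tokens) has passed, so the gaps
 * grow roughly as 40ms, 80ms, 160ms, ... ~5s at HZ=100.  After nine redirects
 * we stop entirely until no redirect-worthy packets are seen for
 * ip_rt_redirect_silence, which resets rate_tokens.
 */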
1550 void ip_rt_send_redirect(struct sk_buff *skb)
1551 {
1552         struct rtable *rt = skb_rtable(skb);
1553         struct in_device *in_dev;
1554         struct inet_peer *peer;
1555         int log_martians;
1556
1557         rcu_read_lock();
1558         in_dev = __in_dev_get_rcu(rt->dst.dev);
1559         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1560                 rcu_read_unlock();
1561                 return;
1562         }
1563         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1564         rcu_read_unlock();
1565
1566         peer = rt_get_peer_create(rt, rt->rt_dst);
1567         if (!peer) {
1568                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1569                 return;
1570         }
1571
1572         /* No redirected packets during ip_rt_redirect_silence;
1573          * reset the algorithm.
1574          */
1575         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1576                 peer->rate_tokens = 0;
1577
1578         /* Too many ignored redirects; do not send anything.
1579          * Set rate_last to the time of the last seen redirected packet.
1580          */
1581         if (peer->rate_tokens >= ip_rt_redirect_number) {
1582                 peer->rate_last = jiffies;
1583                 return;
1584         }
1585
1586         /* Check for load limit; set rate_last to the latest sent
1587          * redirect.
1588          */
1589         if (peer->rate_tokens == 0 ||
1590             time_after(jiffies,
1591                        (peer->rate_last +
1592                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1593                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1594                 peer->rate_last = jiffies;
1595                 ++peer->rate_tokens;
1596 #ifdef CONFIG_IP_ROUTE_VERBOSE
1597                 if (log_martians &&
1598                     peer->rate_tokens == ip_rt_redirect_number)
1599                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1600                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1601                                              &rt->rt_dst, &rt->rt_gateway);
1602 #endif
1603         }
1604 }
1605
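/* ip_error() is installed as dst->input for unroutable cache entries (see the
 * RTN_UNREACHABLE handling in ip_route_input_slow() below): it updates the
 * relevant SNMP counters and sends a rate-limited ICMP destination
 * unreachable, using the token bucket (rate_tokens/rate_last) kept in the
 * inet_peer.  Note that in_dev may be NULL when IP is disabled on the
 * device, hence the check before IN_DEV_FORWARD().
 */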
1606 static int ip_error(struct sk_buff *skb)
1607 {
1608         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1609         struct rtable *rt = skb_rtable(skb);
1610         struct inet_peer *peer;
1611         unsigned long now;
1612         struct net *net;
1613         bool send;
1614         int code;
1615
1616         net = dev_net(rt->dst.dev);
1617         if (!in_dev || !IN_DEV_FORWARD(in_dev)) {
1618                 switch (rt->dst.error) {
1619                 case EHOSTUNREACH:
1620                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1621                         break;
1622
1623                 case ENETUNREACH:
1624                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1625                         break;
1626                 }
1627                 goto out;
1628         }
1629
1630         switch (rt->dst.error) {
1631         case EINVAL:
1632         default:
1633                 goto out;
1634         case EHOSTUNREACH:
1635                 code = ICMP_HOST_UNREACH;
1636                 break;
1637         case ENETUNREACH:
1638                 code = ICMP_NET_UNREACH;
1639                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1640                 break;
1641         case EACCES:
1642                 code = ICMP_PKT_FILTERED;
1643                 break;
1644         }
1645
1646         peer = rt_get_peer_create(rt, rt->rt_dst);
1647
1648         send = true;
1649         if (peer) {
1650                 now = jiffies;
1651                 peer->rate_tokens += now - peer->rate_last;
1652                 if (peer->rate_tokens > ip_rt_error_burst)
1653                         peer->rate_tokens = ip_rt_error_burst;
1654                 peer->rate_last = now;
1655                 if (peer->rate_tokens >= ip_rt_error_cost)
1656                         peer->rate_tokens -= ip_rt_error_cost;
1657                 else
1658                         send = false;
1659         }
1660         if (send)
1661                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1662
1663 out:    kfree_skb(skb);
1664         return 0;
1665 }
1666
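/* Apply a peer's cached, learned PMTU (recorded by ip_rt_update_pmtu() below)
 * to this dst: while the learned value has not expired and is smaller than
 * the current dst MTU, install it, saving the original MTU first; once it
 * has expired, the first caller to notice restores pmtu_orig.
 */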
1667 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1668 {
1669         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1670
1671         if (!expires)
1672                 return;
1673         if (time_before(jiffies, expires)) {
1674                 u32 orig_dst_mtu = dst_mtu(dst);
1675                 if (peer->pmtu_learned < orig_dst_mtu) {
1676                         if (!peer->pmtu_orig)
1677                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1678                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1679                 }
1680         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1681                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1682 }
1683
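/* ip_rt_update_pmtu() is the .update_pmtu handler for IPv4 routes: the
 * reported MTU is raised to at least ip_rt_min_pmtu and, if it improves on
 * (or there is no) pending learned value, stored in the peer with an
 * ip_rt_mtu_expires deadline; the global peer generation id is bumped so
 * other cached routes sharing the peer pick it up via ipv4_validate_peer().
 */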
1684 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1685 {
1686         struct rtable *rt = (struct rtable *) dst;
1687         struct inet_peer *peer;
1688
1689         dst_confirm(dst);
1690
1691         peer = rt_get_peer_create(rt, rt->rt_dst);
1692         if (peer) {
1693                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1694
1695                 if (mtu < ip_rt_min_pmtu)
1696                         mtu = ip_rt_min_pmtu;
1697                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1698
1699                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1700                         if (!pmtu_expires)
1701                                 pmtu_expires = 1UL;
1702
1703                         peer->pmtu_learned = mtu;
1704                         peer->pmtu_expires = pmtu_expires;
1705
1706                         atomic_inc(&__rt_peer_genid);
1707                         rt->rt_peer_genid = rt_peer_genid();
1708                 }
1709                 check_peer_pmtu(dst, peer);
1710         }
1711 }
1712
1713 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1714                       int oif, u32 mark, u8 protocol, int flow_flags)
1715 {
1716         const struct iphdr *iph = (const struct iphdr *)skb->data;
1717         struct flowi4 fl4;
1718         struct rtable *rt;
1719
1720         flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
1721                            protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
1722                            iph->daddr, iph->saddr, 0, 0);
1723         rt = __ip_route_output_key(net, &fl4);
1724         if (!IS_ERR(rt)) {
1725                 ip_rt_update_pmtu(&rt->dst, mtu);
1726                 ip_rt_put(rt);
1727         }
1728 }
1729 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1730
1731 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1732 {
1733         const struct inet_sock *inet = inet_sk(sk);
1734
1735         return ipv4_update_pmtu(skb, sock_net(sk), mtu,
1736                                 sk->sk_bound_dev_if, sk->sk_mark,
1737                                 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1738                                 inet_sk_flowi_flags(sk));
1739 }
1740 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
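/* A minimal usage sketch (hypothetical caller; this helper is meant for
 * protocol ICMP error handlers): on a "fragmentation needed" error for a
 * connected socket one would call
 *
 *      ipv4_sk_update_pmtu(skb, sk, new_mtu);
 *
 * where new_mtu is the MTU quoted in the ICMP error.  It builds the flow
 * from the quoted IP header plus the socket's bound device, mark and
 * protocol, re-resolves the route and feeds new_mtu into
 * ip_rt_update_pmtu() above.
 */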
1741
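/* Lazily revalidate a cached route against its inet_peer: if the global peer
 * generation id has moved on since this rtable was last checked, re-apply any
 * learned PMTU and gateway redirect recorded in the peer, then remember the
 * current genid.
 */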
1742 static void ipv4_validate_peer(struct rtable *rt)
1743 {
1744         if (rt->rt_peer_genid != rt_peer_genid()) {
1745                 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
1746
1747                 if (peer) {
1748                         check_peer_pmtu(&rt->dst, peer);
1749
1750                         if (peer->redirect_learned.a4 &&
1751                             peer->redirect_learned.a4 != rt->rt_gateway)
1752                                 check_peer_redir(&rt->dst, peer);
1753                 }
1754
1755                 rt->rt_peer_genid = rt_peer_genid();
1756         }
1757 }
1758
1759 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1760 {
1761         struct rtable *rt = (struct rtable *) dst;
1762
1763         if (rt_is_expired(rt))
1764                 return NULL;
1765         ipv4_validate_peer(rt);
1766         return dst;
1767 }
1768
1769 static void ipv4_dst_destroy(struct dst_entry *dst)
1770 {
1771         struct rtable *rt = (struct rtable *) dst;
1772
1773         if (rt->fi) {
1774                 fib_info_put(rt->fi);
1775                 rt->fi = NULL;
1776         }
1777         if (rt_has_peer(rt)) {
1778                 struct inet_peer *peer = rt_peer_ptr(rt);
1779                 inet_putpeer(peer);
1780         }
1781 }
1782
1783
1784 static void ipv4_link_failure(struct sk_buff *skb)
1785 {
1786         struct rtable *rt;
1787
1788         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1789
1790         rt = skb_rtable(skb);
1791         if (rt && rt_has_peer(rt)) {
1792                 struct inet_peer *peer = rt_peer_ptr(rt);
1793                 if (peer_pmtu_cleaned(peer))
1794                         dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
1795         }
1796 }
1797
1798 static int ip_rt_bug(struct sk_buff *skb)
1799 {
1800         pr_debug("%s: %pI4 -> %pI4, %s\n",
1801                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1802                  skb->dev ? skb->dev->name : "?");
1803         kfree_skb(skb);
1804         WARN_ON(1);
1805         return 0;
1806 }
1807
1808 /*
1809    We do not cache the source address of the outgoing interface,
1810    because it is used only by IP RR, TS and SRR options,
1811    so it stays out of the fast path.
1812
1813    BTW remember: "addr" is allowed to be unaligned
1814    in IP options!
1815  */
1816
1817 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1818 {
1819         __be32 src;
1820
1821         if (rt_is_output_route(rt))
1822                 src = ip_hdr(skb)->saddr;
1823         else {
1824                 struct fib_result res;
1825                 struct flowi4 fl4;
1826                 struct iphdr *iph;
1827
1828                 iph = ip_hdr(skb);
1829
1830                 memset(&fl4, 0, sizeof(fl4));
1831                 fl4.daddr = iph->daddr;
1832                 fl4.saddr = iph->saddr;
1833                 fl4.flowi4_tos = RT_TOS(iph->tos);
1834                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1835                 fl4.flowi4_iif = skb->dev->ifindex;
1836                 fl4.flowi4_mark = skb->mark;
1837
1838                 rcu_read_lock();
1839                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1840                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1841                 else
1842                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1843                                         RT_SCOPE_UNIVERSE);
1844                 rcu_read_unlock();
1845         }
1846         memcpy(addr, &src, 4);
1847 }
1848
1849 #ifdef CONFIG_IP_ROUTE_CLASSID
1850 static void set_class_tag(struct rtable *rt, u32 tag)
1851 {
1852         if (!(rt->dst.tclassid & 0xFFFF))
1853                 rt->dst.tclassid |= tag & 0xFFFF;
1854         if (!(rt->dst.tclassid & 0xFFFF0000))
1855                 rt->dst.tclassid |= tag & 0xFFFF0000;
1856 }
1857 #endif
1858
1859 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1860 {
1861         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1862
1863         if (advmss == 0) {
1864                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1865                                ip_rt_min_advmss);
1866                 if (advmss > 65535 - 40)
1867                         advmss = 65535 - 40;
1868         }
1869         return advmss;
1870 }
1871
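/* ipv4_mtu() is the .mtu handler: a cached RTAX_MTU wins for output routes;
 * otherwise fall back to the device MTU, dropping to 576 for gatewayed
 * destinations when the MTU metric is locked, and never exceed IP_MAX_MTU.
 */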
1872 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1873 {
1874         const struct rtable *rt = (const struct rtable *) dst;
1875         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1876
1877         if (mtu && rt_is_output_route(rt))
1878                 return mtu;
1879
1880         mtu = dst->dev->mtu;
1881
1882         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1883
1884                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1885                         mtu = 576;
1886         }
1887
1888         if (mtu > IP_MAX_MTU)
1889                 mtu = IP_MAX_MTU;
1890
1891         return mtu;
1892 }
1893
1894 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1895                             struct fib_info *fi)
1896 {
1897         struct inet_peer_base *base;
1898         struct inet_peer *peer;
1899         int create = 0;
1900
1901         /* If a peer entry exists for this destination, we must hook
1902          * it up in order to get at cached metrics.
1903          */
1904         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1905                 create = 1;
1906
1907         base = inetpeer_base_ptr(rt->_peer);
1908         BUG_ON(!base);
1909
1910         peer = inet_getpeer_v4(base, rt->rt_dst, create);
1911         if (peer) {
1912                 __rt_set_peer(rt, peer);
1913                 rt->rt_peer_genid = rt_peer_genid();
1914                 if (inet_metrics_new(peer))
1915                         memcpy(peer->metrics, fi->fib_metrics,
1916                                sizeof(u32) * RTAX_MAX);
1917                 dst_init_metrics(&rt->dst, peer->metrics, false);
1918
1919                 check_peer_pmtu(&rt->dst, peer);
1920
1921                 if (peer->redirect_learned.a4 &&
1922                     peer->redirect_learned.a4 != rt->rt_gateway) {
1923                         rt->rt_gateway = peer->redirect_learned.a4;
1924                         rt->rt_flags |= RTCF_REDIRECTED;
1925                 }
1926         } else {
1927                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1928                         rt->fi = fi;
1929                         atomic_inc(&fi->fib_clntref);
1930                 }
1931                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1932         }
1933 }
1934
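/* rt_set_nexthop() copies the pieces of a new cache entry that come from the
 * FIB result: the gateway for link-scope nexthops, metrics and the peer
 * binding via rt_init_metrics(), and the nexthop's tclassid; it then clamps
 * the MTU to IP_MAX_MTU and applies the class tags (set_class_tag()) when
 * CONFIG_IP_ROUTE_CLASSID is enabled.
 */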
1935 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1936                            const struct fib_result *res,
1937                            struct fib_info *fi, u16 type, u32 itag)
1938 {
1939         struct dst_entry *dst = &rt->dst;
1940
1941         if (fi) {
1942                 if (FIB_RES_GW(*res) &&
1943                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1944                         rt->rt_gateway = FIB_RES_GW(*res);
1945                 rt_init_metrics(rt, fl4, fi);
1946 #ifdef CONFIG_IP_ROUTE_CLASSID
1947                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1948 #endif
1949         }
1950
1951         if (dst_mtu(dst) > IP_MAX_MTU)
1952                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1953
1954 #ifdef CONFIG_IP_ROUTE_CLASSID
1955 #ifdef CONFIG_IP_MULTIPLE_TABLES
1956         set_class_tag(rt, fib_rules_tclass(res));
1957 #endif
1958         set_class_tag(rt, itag);
1959 #endif
1960 }
1961
1962 static struct rtable *rt_dst_alloc(struct net_device *dev,
1963                                    bool nopolicy, bool noxfrm)
1964 {
1965         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1966                          DST_HOST |
1967                          (nopolicy ? DST_NOPOLICY : 0) |
1968                          (noxfrm ? DST_NOXFRM : 0));
1969 }
1970
1971 /* called in rcu_read_lock() section */
1972 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1973                                 u8 tos, struct net_device *dev, int our)
1974 {
1975         unsigned int hash;
1976         struct rtable *rth;
1977         struct in_device *in_dev = __in_dev_get_rcu(dev);
1978         u32 itag = 0;
1979         int err;
1980
1981         /* Primary sanity checks. */
1982
1983         if (in_dev == NULL)
1984                 return -EINVAL;
1985
1986         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1987             skb->protocol != htons(ETH_P_IP))
1988                 goto e_inval;
1989
1990         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1991                 if (ipv4_is_loopback(saddr))
1992                         goto e_inval;
1993
1994         if (ipv4_is_zeronet(saddr)) {
1995                 if (!ipv4_is_local_multicast(daddr))
1996                         goto e_inval;
1997         } else {
1998                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1999                                           in_dev, &itag);
2000                 if (err < 0)
2001                         goto e_err;
2002         }
2003         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2004                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2005         if (!rth)
2006                 goto e_nobufs;
2007
2008 #ifdef CONFIG_IP_ROUTE_CLASSID
2009         rth->dst.tclassid = itag;
2010 #endif
2011         rth->dst.output = ip_rt_bug;
2012
2013         rth->rt_key_dst = daddr;
2014         rth->rt_key_src = saddr;
2015         rth->rt_genid   = rt_genid(dev_net(dev));
2016         rth->rt_flags   = RTCF_MULTICAST;
2017         rth->rt_type    = RTN_MULTICAST;
2018         rth->rt_key_tos = tos;
2019         rth->rt_dst     = daddr;
2020         rth->rt_src     = saddr;
2021         rth->rt_route_iif = dev->ifindex;
2022         rth->rt_iif     = dev->ifindex;
2023         rth->rt_oif     = 0;
2024         rth->rt_mark    = skb->mark;
2025         rth->rt_gateway = daddr;
2026         rth->rt_peer_genid = 0;
2027         rt_init_peer(rth, dev_net(dev)->ipv4.peers);
2028         rth->fi = NULL;
2029         if (our) {
2030                 rth->dst.input = ip_local_deliver;
2031                 rth->rt_flags |= RTCF_LOCAL;
2032         }
2033
2034 #ifdef CONFIG_IP_MROUTE
2035         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2036                 rth->dst.input = ip_mr_input;
2037 #endif
2038         RT_CACHE_STAT_INC(in_slow_mc);
2039
2040         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2041         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2042         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2043
2044 e_nobufs:
2045         return -ENOBUFS;
2046 e_inval:
2047         return -EINVAL;
2048 e_err:
2049         return err;
2050 }
2051
2052
2053 static void ip_handle_martian_source(struct net_device *dev,
2054                                      struct in_device *in_dev,
2055                                      struct sk_buff *skb,
2056                                      __be32 daddr,
2057                                      __be32 saddr)
2058 {
2059         RT_CACHE_STAT_INC(in_martian_src);
2060 #ifdef CONFIG_IP_ROUTE_VERBOSE
2061         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2062                 /*
2063                  *      RFC1812 recommendation: if the source is martian,
2064                  *      the only hint is the MAC header.
2065                  */
2066                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2067                         &daddr, &saddr, dev->name);
2068                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2069                         print_hex_dump(KERN_WARNING, "ll header: ",
2070                                        DUMP_PREFIX_OFFSET, 16, 1,
2071                                        skb_mac_header(skb),
2072                                        dev->hard_header_len, true);
2073                 }
2074         }
2075 #endif
2076 }
2077
2078 /* called in rcu_read_lock() section */
2079 static int __mkroute_input(struct sk_buff *skb,
2080                            const struct fib_result *res,
2081                            struct in_device *in_dev,
2082                            __be32 daddr, __be32 saddr, u32 tos,
2083                            struct rtable **result)
2084 {
2085         struct rtable *rth;
2086         int err;
2087         struct in_device *out_dev;
2088         unsigned int flags = 0;
2089         u32 itag;
2090
2091         /* get a working reference to the output device */
2092         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2093         if (out_dev == NULL) {
2094                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2095                 return -EINVAL;
2096         }
2097
2098
2099         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2100                                   in_dev->dev, in_dev, &itag);
2101         if (err < 0) {
2102                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2103                                          saddr);
2104
2105                 goto cleanup;
2106         }
2107
2108         if (err)
2109                 flags |= RTCF_DIRECTSRC;
2110
2111         if (out_dev == in_dev && err &&
2112             (IN_DEV_SHARED_MEDIA(out_dev) ||
2113              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2114                 flags |= RTCF_DOREDIRECT;
2115
2116         if (skb->protocol != htons(ETH_P_IP)) {
2117                 /* Not IP (i.e. ARP). Do not create a route if it is
2118                  * invalid for proxy arp. DNAT routes are always valid.
2119                  *
2120                  * The proxy arp feature has been extended to allow ARP
2121                  * replies back out the same interface, to support
2122                  * Private VLAN switch technologies. See arp.c.
2123                  */
2124                 if (out_dev == in_dev &&
2125                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2126                         err = -EINVAL;
2127                         goto cleanup;
2128                 }
2129         }
2130
2131         rth = rt_dst_alloc(out_dev->dev,
2132                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2133                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2134         if (!rth) {
2135                 err = -ENOBUFS;
2136                 goto cleanup;
2137         }
2138
2139         rth->rt_key_dst = daddr;
2140         rth->rt_key_src = saddr;
2141         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2142         rth->rt_flags = flags;
2143         rth->rt_type = res->type;
2144         rth->rt_key_tos = tos;
2145         rth->rt_dst     = daddr;
2146         rth->rt_src     = saddr;
2147         rth->rt_route_iif = in_dev->dev->ifindex;
2148         rth->rt_iif     = in_dev->dev->ifindex;
2149         rth->rt_oif     = 0;
2150         rth->rt_mark    = skb->mark;
2151         rth->rt_gateway = daddr;
2152         rth->rt_peer_genid = 0;
2153         rt_init_peer(rth, &res->table->tb_peers);
2154         rth->fi = NULL;
2155
2156         rth->dst.input = ip_forward;
2157         rth->dst.output = ip_output;
2158
2159         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2160
2161         *result = rth;
2162         err = 0;
2163  cleanup:
2164         return err;
2165 }
2166
2167 static int ip_mkroute_input(struct sk_buff *skb,
2168                             struct fib_result *res,
2169                             const struct flowi4 *fl4,
2170                             struct in_device *in_dev,
2171                             __be32 daddr, __be32 saddr, u32 tos)
2172 {
2173         struct rtable *rth = NULL;
2174         int err;
2175         unsigned int hash;
2176
2177 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2178         if (res->fi && res->fi->fib_nhs > 1)
2179                 fib_select_multipath(res);
2180 #endif
2181
2182         /* create a routing cache entry */
2183         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2184         if (err)
2185                 return err;
2186
2187         /* put it into the cache */
2188         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2189                        rt_genid(dev_net(rth->dst.dev)));
2190         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2191         if (IS_ERR(rth))
2192                 return PTR_ERR(rth);
2193         return 0;
2194 }
2195
2196 /*
2197  *      NOTE. We drop all packets that have a local source
2198  *      address, because every properly looped-back packet
2199  *      must already have the correct destination attached by the output routine.
2200  *
2201  *      This approach solves two big problems:
2202  *      1. Non-simplex devices are handled properly.
2203  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2204  *      Called with rcu_read_lock().
2205  */
2206
2207 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2208                                u8 tos, struct net_device *dev)
2209 {
2210         struct fib_result res;
2211         struct in_device *in_dev = __in_dev_get_rcu(dev);
2212         struct flowi4   fl4;
2213         unsigned int    flags = 0;
2214         u32             itag = 0;
2215         struct rtable   *rth;
2216         unsigned int    hash;
2217         int             err = -EINVAL;
2218         struct net    *net = dev_net(dev);
2219
2220         /* IP on this device is disabled. */
2221
2222         if (!in_dev)
2223                 goto out;
2224
2225         /* Check for the weirdest martians, which cannot be detected
2226            by fib_lookup.
2227          */
2228
2229         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2230                 goto martian_source;
2231
2232         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2233                 goto brd_input;
2234
2235         /* Accept zero addresses only to limited broadcast;
2236          * it is unclear whether this should be fixed. Waiting for complaints :-)
2237          */
2238         if (ipv4_is_zeronet(saddr))
2239                 goto martian_source;
2240
2241         if (ipv4_is_zeronet(daddr))
2242                 goto martian_destination;
2243
2244         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2245                 if (ipv4_is_loopback(daddr))
2246                         goto martian_destination;
2247
2248                 if (ipv4_is_loopback(saddr))
2249                         goto martian_source;
2250         }
2251
2252         /*
2253          *      Now we are ready to route packet.
2254          */
2255         fl4.flowi4_oif = 0;
2256         fl4.flowi4_iif = dev->ifindex;
2257         fl4.flowi4_mark = skb->mark;
2258         fl4.flowi4_tos = tos;
2259         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2260         fl4.daddr = daddr;
2261         fl4.saddr = saddr;
2262         err = fib_lookup(net, &fl4, &res);
2263         if (err != 0)
2264                 goto no_route;
2265
2266         RT_CACHE_STAT_INC(in_slow_tot);
2267
2268         if (res.type == RTN_BROADCAST)
2269                 goto brd_input;
2270
2271         if (res.type == RTN_LOCAL) {
2272                 err = fib_validate_source(skb, saddr, daddr, tos,
2273                                           net->loopback_dev->ifindex,
2274                                           dev, in_dev, &itag);
2275                 if (err < 0)
2276                         goto martian_source_keep_err;
2277                 if (err)
2278                         flags |= RTCF_DIRECTSRC;
2279                 goto local_input;
2280         }
2281
2282         if (!IN_DEV_FORWARD(in_dev))
2283                 goto no_route;
2284         if (res.type != RTN_UNICAST)
2285                 goto martian_destination;
2286
2287         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2288 out:    return err;
2289
2290 brd_input:
2291         if (skb->protocol != htons(ETH_P_IP))
2292                 goto e_inval;
2293
2294         if (!ipv4_is_zeronet(saddr)) {
2295                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2296                                           in_dev, &itag);
2297                 if (err < 0)
2298                         goto martian_source_keep_err;
2299                 if (err)
2300                         flags |= RTCF_DIRECTSRC;
2301         }
2302         flags |= RTCF_BROADCAST;
2303         res.type = RTN_BROADCAST;
2304         RT_CACHE_STAT_INC(in_brd);
2305
2306 local_input:
2307         rth = rt_dst_alloc(net->loopback_dev,
2308                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2309         if (!rth)
2310                 goto e_nobufs;
2311
2312         rth->dst.input = ip_local_deliver;
2313         rth->dst.output = ip_rt_bug;
2314 #ifdef CONFIG_IP_ROUTE_CLASSID
2315         rth->dst.tclassid = itag;
2316 #endif
2317
2318         rth->rt_key_dst = daddr;
2319         rth->rt_key_src = saddr;
2320         rth->rt_genid = rt_genid(net);
2321         rth->rt_flags   = flags|RTCF_LOCAL;
2322         rth->rt_type    = res.type;
2323         rth->rt_key_tos = tos;
2324         rth->rt_dst     = daddr;
2325         rth->rt_src     = saddr;
2326         rth->rt_route_iif = dev->ifindex;
2327         rth->rt_iif     = dev->ifindex;
2328         rth->rt_oif     = 0;
2329         rth->rt_mark    = skb->mark;
2330         rth->rt_gateway = daddr;
2331         rth->rt_peer_genid = 0;
2332         rt_init_peer(rth, net->ipv4.peers);
2333         rth->fi = NULL;
2334         if (res.type == RTN_UNREACHABLE) {
2335                 rth->dst.input = ip_error;
2336                 rth->dst.error = -err;
2337                 rth->rt_flags   &= ~RTCF_LOCAL;
2338         }
2339         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2340         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2341         err = 0;
2342         if (IS_ERR(rth))
2343                 err = PTR_ERR(rth);
2344         goto out;
2345
2346 no_route:
2347         RT_CACHE_STAT_INC(in_no_route);
2348         res.type = RTN_UNREACHABLE;
2349         if (err == -ESRCH)
2350                 err = -ENETUNREACH;
2351         goto local_input;
2352
2353         /*
2354          *      Do not cache martian addresses: they should be logged (RFC1812)
2355          */
2356 martian_destination:
2357         RT_CACHE_STAT_INC(in_martian_dst);
2358 #ifdef CONFIG_IP_ROUTE_VERBOSE
2359         if (IN_DEV_LOG_MARTIANS(in_dev))
2360                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2361                                      &daddr, &saddr, dev->name);
2362 #endif
2363
2364 e_inval:
2365         err = -EINVAL;
2366         goto out;
2367
2368 e_nobufs:
2369         err = -ENOBUFS;
2370         goto out;
2371
2372 martian_source:
2373         err = -EINVAL;
2374 martian_source_keep_err:
2375         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2376         goto out;
2377 }
2378
2379 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2380                            u8 tos, struct net_device *dev, bool noref)
2381 {
2382         struct rtable   *rth;
2383         unsigned int    hash;
2384         int iif = dev->ifindex;
2385         struct net *net;
2386         int res;
2387
2388         net = dev_net(dev);
2389
2390         rcu_read_lock();
2391
2392         if (!rt_caching(net))
2393                 goto skip_cache;
2394
2395         tos &= IPTOS_RT_MASK;
2396         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2397
2398         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2399              rth = rcu_dereference(rth->dst.rt_next)) {
2400                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2401                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2402                      (rth->rt_route_iif ^ iif) |
2403                      (rth->rt_key_tos ^ tos)) == 0 &&
2404                     rth->rt_mark == skb->mark &&
2405                     net_eq(dev_net(rth->dst.dev), net) &&
2406                     !rt_is_expired(rth)) {
2407                         ipv4_validate_peer(rth);
2408                         if (noref) {
2409                                 dst_use_noref(&rth->dst, jiffies);
2410                                 skb_dst_set_noref(skb, &rth->dst);
2411                         } else {
2412                                 dst_use(&rth->dst, jiffies);
2413                                 skb_dst_set(skb, &rth->dst);
2414                         }
2415                         RT_CACHE_STAT_INC(in_hit);
2416                         rcu_read_unlock();
2417                         return 0;
2418                 }
2419                 RT_CACHE_STAT_INC(in_hlist_search);
2420         }
2421
2422 skip_cache:
2423         /* Multicast recognition logic is moved from the route cache to here.
2424            The problem was that too many Ethernet cards have broken/missing
2425            hardware multicast filters :-( As a result, a host on a multicast
2426            network acquires a lot of useless route cache entries, e.g. for
2427            SDR messages from all over the world. Now we try to get rid of them.
2428            Really, provided the software IP multicast filter is organized
2429            reasonably (at least, hashed), it does not cause a slowdown
2430            compared with route cache reject entries.
2431            Note that multicast routers are not affected, because
2432            a route cache entry is created eventually.
2433          */
2434         if (ipv4_is_multicast(daddr)) {
2435                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2436
2437                 if (in_dev) {
2438                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2439                                                   ip_hdr(skb)->protocol);
2440                         if (our
2441 #ifdef CONFIG_IP_MROUTE
2442                                 ||
2443                             (!ipv4_is_local_multicast(daddr) &&
2444                              IN_DEV_MFORWARD(in_dev))
2445 #endif
2446                            ) {
2447                                 int res = ip_route_input_mc(skb, daddr, saddr,
2448                                                             tos, dev, our);
2449                                 rcu_read_unlock();
2450                                 return res;
2451                         }
2452                 }
2453                 rcu_read_unlock();
2454                 return -EINVAL;
2455         }
2456         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2457         rcu_read_unlock();
2458         return res;
2459 }
2460 EXPORT_SYMBOL(ip_route_input_common);
2461
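/* Build an output route cache entry from a FIB lookup result: classify the
 * destination (broadcast/multicast/local), allocate the rtable, copy the
 * flow keys, wire up the output (and, for local/multicast, input) handlers
 * and let rt_set_nexthop() attach gateway, metrics and peer.
 */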
2462 /* called with rcu_read_lock() */
2463 static struct rtable *__mkroute_output(const struct fib_result *res,
2464                                        const struct flowi4 *fl4,
2465                                        __be32 orig_daddr, __be32 orig_saddr,
2466                                        int orig_oif, __u8 orig_rtos,
2467                                        struct net_device *dev_out,
2468                                        unsigned int flags)
2469 {
2470         struct fib_info *fi = res->fi;
2471         struct in_device *in_dev;
2472         u16 type = res->type;
2473         struct rtable *rth;
2474
2475         in_dev = __in_dev_get_rcu(dev_out);
2476         if (!in_dev)
2477                 return ERR_PTR(-EINVAL);
2478
2479         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2480                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2481                         return ERR_PTR(-EINVAL);
2482
2483         if (ipv4_is_lbcast(fl4->daddr))
2484                 type = RTN_BROADCAST;
2485         else if (ipv4_is_multicast(fl4->daddr))
2486                 type = RTN_MULTICAST;
2487         else if (ipv4_is_zeronet(fl4->daddr))
2488                 return ERR_PTR(-EINVAL);
2489
2490         if (dev_out->flags & IFF_LOOPBACK)
2491                 flags |= RTCF_LOCAL;
2492
2493         if (type == RTN_BROADCAST) {
2494                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2495                 fi = NULL;
2496         } else if (type == RTN_MULTICAST) {
2497                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2498                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2499                                      fl4->flowi4_proto))
2500                         flags &= ~RTCF_LOCAL;
2501                 /* If a multicast route does not exist, use
2502                  * the default one, but do not gateway in this case.
2503                  * Yes, it is a hack.
2504                  */
2505                 if (fi && res->prefixlen < 4)
2506                         fi = NULL;
2507         }
2508
2509         rth = rt_dst_alloc(dev_out,
2510                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2511                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2512         if (!rth)
2513                 return ERR_PTR(-ENOBUFS);
2514
2515         rth->dst.output = ip_output;
2516
2517         rth->rt_key_dst = orig_daddr;
2518         rth->rt_key_src = orig_saddr;
2519         rth->rt_genid = rt_genid(dev_net(dev_out));
2520         rth->rt_flags   = flags;
2521         rth->rt_type    = type;
2522         rth->rt_key_tos = orig_rtos;
2523         rth->rt_dst     = fl4->daddr;
2524         rth->rt_src     = fl4->saddr;
2525         rth->rt_route_iif = 0;
2526         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2527         rth->rt_oif     = orig_oif;
2528         rth->rt_mark    = fl4->flowi4_mark;
2529         rth->rt_gateway = fl4->daddr;
2530         rth->rt_peer_genid = 0;
2531         rt_init_peer(rth, (res->table ?
2532                            &res->table->tb_peers :
2533                            dev_net(dev_out)->ipv4.peers));
2534         rth->fi = NULL;
2535
2536         RT_CACHE_STAT_INC(out_slow_tot);
2537
2538         if (flags & RTCF_LOCAL)
2539                 rth->dst.input = ip_local_deliver;
2540         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2541                 if (flags & RTCF_LOCAL &&
2542                     !(dev_out->flags & IFF_LOOPBACK)) {
2543                         rth->dst.output = ip_mc_output;
2544                         RT_CACHE_STAT_INC(out_slow_mc);
2545                 }
2546 #ifdef CONFIG_IP_MROUTE
2547                 if (type == RTN_MULTICAST) {
2548                         if (IN_DEV_MFORWARD(in_dev) &&
2549                             !ipv4_is_local_multicast(fl4->daddr)) {
2550                                 rth->dst.input = ip_mr_input;
2551                                 rth->dst.output = ip_mc_output;
2552                         }
2553                 }
2554 #endif
2555         }
2556
2557         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2558
2559         if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2560                 rth->dst.flags |= DST_NOCACHE;
2561
2562         return rth;
2563 }
2564
2565 /*
2566  * Major route resolver routine.
2567  * called with rcu_read_lock();
2568  */
2569
2570 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2571 {
2572         struct net_device *dev_out = NULL;
2573         __u8 tos = RT_FL_TOS(fl4);
2574         unsigned int flags = 0;
2575         struct fib_result res;
2576         struct rtable *rth;
2577         __be32 orig_daddr;
2578         __be32 orig_saddr;
2579         int orig_oif;
2580
2581         res.fi          = NULL;
2582         res.table       = NULL;
2583 #ifdef CONFIG_IP_MULTIPLE_TABLES
2584         res.r           = NULL;
2585 #endif
2586
2587         orig_daddr = fl4->daddr;
2588         orig_saddr = fl4->saddr;
2589         orig_oif = fl4->flowi4_oif;
2590
2591         fl4->flowi4_iif = net->loopback_dev->ifindex;
2592         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2593         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2594                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2595
2596         rcu_read_lock();
2597         if (fl4->saddr) {
2598                 rth = ERR_PTR(-EINVAL);
2599                 if (ipv4_is_multicast(fl4->saddr) ||
2600                     ipv4_is_lbcast(fl4->saddr) ||
2601                     ipv4_is_zeronet(fl4->saddr))
2602                         goto out;
2603
2604                 /* I removed the check for oif == dev_out->oif here.
2605                    It was wrong for two reasons:
2606                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2607                       is assigned to multiple interfaces.
2608                    2. Moreover, we are allowed to send packets with saddr
2609                       of another iface. --ANK
2610                  */
2611
2612                 if (fl4->flowi4_oif == 0 &&
2613                     (ipv4_is_multicast(fl4->daddr) ||
2614                      ipv4_is_lbcast(fl4->daddr))) {
2615                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2616                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2617                         if (dev_out == NULL)
2618                                 goto out;
2619
2620                         /* Special hack: the user can direct multicasts
2621                            and limited broadcast via the necessary interface
2622                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2623                            This hack is not just for fun, it allows
2624                            vic, vat and friends to work.
2625                            They bind the socket to loopback, set ttl to zero
2626                            and expect that it will work.
2627                            From the viewpoint of the routing cache they are broken,
2628                            because we are not allowed to build a multicast path
2629                            with a loopback source addr (look, the routing cache
2630                            cannot know that ttl is zero, so the packet
2631                            will not leave this host and the route is valid).
2632                            Luckily, this hack is a good workaround.
2633                          */
2634
2635                         fl4->flowi4_oif = dev_out->ifindex;
2636                         goto make_route;
2637                 }
2638
2639                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2640                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2641                         if (!__ip_dev_find(net, fl4->saddr, false))
2642                                 goto out;
2643                 }
2644         }
2645
2646
2647         if (fl4->flowi4_oif) {
2648                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2649                 rth = ERR_PTR(-ENODEV);
2650                 if (dev_out == NULL)
2651                         goto out;
2652
2653                 /* RACE: Check return value of inet_select_addr instead. */
2654                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2655                         rth = ERR_PTR(-ENETUNREACH);
2656                         goto out;
2657                 }
2658                 if (ipv4_is_local_multicast(fl4->daddr) ||
2659                     ipv4_is_lbcast(fl4->daddr)) {
2660                         if (!fl4->saddr)
2661                                 fl4->saddr = inet_select_addr(dev_out, 0,
2662                                                               RT_SCOPE_LINK);
2663                         goto make_route;
2664                 }
2665                 if (fl4->saddr) {
2666                         if (ipv4_is_multicast(fl4->daddr))
2667                                 fl4->saddr = inet_select_addr(dev_out, 0,
2668                                                               fl4->flowi4_scope);
2669                         else if (!fl4->daddr)
2670                                 fl4->saddr = inet_select_addr(dev_out, 0,
2671                                                               RT_SCOPE_HOST);
2672                 }
2673         }
2674
2675         if (!fl4->daddr) {
2676                 fl4->daddr = fl4->saddr;
2677                 if (!fl4->daddr)
2678                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2679                 dev_out = net->loopback_dev;
2680                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2681                 res.type = RTN_LOCAL;
2682                 flags |= RTCF_LOCAL;
2683                 goto make_route;
2684         }
2685
2686         if (fib_lookup(net, fl4, &res)) {
2687                 res.fi = NULL;
2688                 res.table = NULL;
2689                 if (fl4->flowi4_oif) {
2690                         /* Apparently, the routing tables are wrong. Assume
2691                            that the destination is on link.
2692
2693                            WHY? DW.
2694                            Because we are allowed to send to an iface
2695                            even if it has NO routes and NO assigned
2696                            addresses. When oif is specified, the routing
2697                            tables are looked up with only one purpose:
2698                            to catch whether the destination is gatewayed,
2699                            rather than direct. Moreover, if MSG_DONTROUTE is set,
2700                            we send the packet, ignoring both the routing tables
2701                            and ifaddr state. --ANK
2702
2703
2704                            We could do the same even when oif is unknown,
2705                            as IPv6 likely does, but we do not.
2706                          */
2707
2708                         if (fl4->saddr == 0)
2709                                 fl4->saddr = inet_select_addr(dev_out, 0,
2710                                                               RT_SCOPE_LINK);
2711                         res.type = RTN_UNICAST;
2712                         goto make_route;
2713                 }
2714                 rth = ERR_PTR(-ENETUNREACH);
2715                 goto out;
2716         }
2717
2718         if (res.type == RTN_LOCAL) {
2719                 if (!fl4->saddr) {
2720                         if (res.fi->fib_prefsrc)
2721                                 fl4->saddr = res.fi->fib_prefsrc;
2722                         else
2723                                 fl4->saddr = fl4->daddr;
2724                 }
2725                 dev_out = net->loopback_dev;
2726                 fl4->flowi4_oif = dev_out->ifindex;
2727                 res.fi = NULL;
2728                 flags |= RTCF_LOCAL;
2729                 goto make_route;
2730         }
2731
2732 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2733         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2734                 fib_select_multipath(&res);
2735         else
2736 #endif
2737         if (!res.prefixlen &&
2738             res.table->tb_num_default > 1 &&
2739             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2740                 fib_select_default(&res);
2741
2742         if (!fl4->saddr)
2743                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2744
2745         dev_out = FIB_RES_DEV(res);
2746         fl4->flowi4_oif = dev_out->ifindex;
2747
2748
2749 make_route:
2750         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2751                                tos, dev_out, flags);
2752         if (!IS_ERR(rth)) {
2753                 unsigned int hash;
2754
2755                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2756                                rt_genid(dev_net(dev_out)));
2757                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2758         }
2759
2760 out:
2761         rcu_read_unlock();
2762         return rth;
2763 }
2764
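/* Output route lookup: scan the cache bucket under rcu_read_lock_bh() for an
 * entry matching the flow's keys, tos and mark, and fall back to the full
 * ip_route_output_slow() resolver on a miss (or when caching is disabled).
 */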
2765 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2766 {
2767         struct rtable *rth;
2768         unsigned int hash;
2769
2770         if (!rt_caching(net))
2771                 goto slow_output;
2772
2773         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2774
2775         rcu_read_lock_bh();
2776         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2777                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2778                 if (rth->rt_key_dst == flp4->daddr &&
2779                     rth->rt_key_src == flp4->saddr &&
2780                     rt_is_output_route(rth) &&
2781                     rth->rt_oif == flp4->flowi4_oif &&
2782                     rth->rt_mark == flp4->flowi4_mark &&
2783                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2784                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2785                     net_eq(dev_net(rth->dst.dev), net) &&
2786                     !rt_is_expired(rth)) {
2787                         ipv4_validate_peer(rth);
2788                         dst_use(&rth->dst, jiffies);
2789                         RT_CACHE_STAT_INC(out_hit);
2790                         rcu_read_unlock_bh();
2791                         if (!flp4->saddr)
2792                                 flp4->saddr = rth->rt_src;
2793                         if (!flp4->daddr)
2794                                 flp4->daddr = rth->rt_dst;
2795                         return rth;
2796                 }
2797                 RT_CACHE_STAT_INC(out_hlist_search);
2798         }
2799         rcu_read_unlock_bh();
2800
2801 slow_output:
2802         return ip_route_output_slow(net, flp4);
2803 }
2804 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2805
2806 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2807 {
2808         return NULL;
2809 }
2810
2811 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2812 {
2813         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2814
2815         return mtu ? : dst->dev->mtu;
2816 }
2817
2818 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2819 {
2820 }
2821
2822 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2823                                           unsigned long old)
2824 {
2825         return NULL;
2826 }
2827
2828 static struct dst_ops ipv4_dst_blackhole_ops = {
2829         .family                 =       AF_INET,
2830         .protocol               =       cpu_to_be16(ETH_P_IP),
2831         .destroy                =       ipv4_dst_destroy,
2832         .check                  =       ipv4_blackhole_dst_check,
2833         .mtu                    =       ipv4_blackhole_mtu,
2834         .default_advmss         =       ipv4_default_advmss,
2835         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2836         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2837         .neigh_lookup           =       ipv4_neigh_lookup,
2838 };
2839
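/* Clone an existing route into a "blackhole" entry backed by
 * ipv4_dst_blackhole_ops: the copy keeps the keys, metrics and peer of the
 * original, but both input and output are dst_discard, and the blackhole
 * dst_ops above neither update PMTU nor COW metrics.
 */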
2840 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2841 {
2842         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2843         struct rtable *ort = (struct rtable *) dst_orig;
2844
2845         if (rt) {
2846                 struct dst_entry *new = &rt->dst;
2847
2848                 new->__use = 1;
2849                 new->input = dst_discard;
2850                 new->output = dst_discard;
2851                 dst_copy_metrics(new, &ort->dst);
2852
2853                 new->dev = ort->dst.dev;
2854                 if (new->dev)
2855                         dev_hold(new->dev);
2856
2857                 rt->rt_key_dst = ort->rt_key_dst;
2858                 rt->rt_key_src = ort->rt_key_src;
2859                 rt->rt_key_tos = ort->rt_key_tos;
2860                 rt->rt_route_iif = ort->rt_route_iif;
2861                 rt->rt_iif = ort->rt_iif;
2862                 rt->rt_oif = ort->rt_oif;
2863                 rt->rt_mark = ort->rt_mark;
2864
2865                 rt->rt_genid = rt_genid(net);
2866                 rt->rt_flags = ort->rt_flags;
2867                 rt->rt_type = ort->rt_type;
2868                 rt->rt_dst = ort->rt_dst;
2869                 rt->rt_src = ort->rt_src;
2870                 rt->rt_gateway = ort->rt_gateway;
2871                 rt_transfer_peer(rt, ort);
2872                 rt->fi = ort->fi;
2873                 if (rt->fi)
2874                         atomic_inc(&rt->fi->fib_clntref);
2875
2876                 dst_free(new);
2877         }
2878
2879         dst_release(dst_orig);
2880
2881         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2882 }
2883
2884 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2885                                     struct sock *sk)
2886 {
2887         struct rtable *rt = __ip_route_output_key(net, flp4);
2888
2889         if (IS_ERR(rt))
2890                 return rt;
2891
2892         if (flp4->flowi4_proto)
2893                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2894                                                    flowi4_to_flowi(flp4),
2895                                                    sk, 0);
2896
2897         return rt;
2898 }
2899 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2900
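/* Fill a rtnetlink message describing a cached route: the rtmsg header plus
 * RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY/RTA_MARK attributes and metrics, the
 * IP id / TCP timestamp / PMTU-expiry state cached in the inet_peer, and,
 * for multicast input routes, whatever ipmr_get_route() reports.
 */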
2901 static int rt_fill_info(struct net *net,
2902                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2903                         int nowait, unsigned int flags)
2904 {
2905         struct rtable *rt = skb_rtable(skb);
2906         struct rtmsg *r;
2907         struct nlmsghdr *nlh;
2908         unsigned long expires = 0;
2909         u32 id = 0, ts = 0, tsage = 0, error;
2910
2911         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2912         if (nlh == NULL)
2913                 return -EMSGSIZE;
2914
2915         r = nlmsg_data(nlh);
2916         r->rtm_family    = AF_INET;
2917         r->rtm_dst_len  = 32;
2918         r->rtm_src_len  = 0;
2919         r->rtm_tos      = rt->rt_key_tos;
2920         r->rtm_table    = RT_TABLE_MAIN;
2921         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2922                 goto nla_put_failure;
2923         r->rtm_type     = rt->rt_type;
2924         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2925         r->rtm_protocol = RTPROT_UNSPEC;
2926         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2927         if (rt->rt_flags & RTCF_NOTIFY)
2928                 r->rtm_flags |= RTM_F_NOTIFY;
2929
2930         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2931                 goto nla_put_failure;
2932         if (rt->rt_key_src) {
2933                 r->rtm_src_len = 32;
2934                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2935                         goto nla_put_failure;
2936         }
2937         if (rt->dst.dev &&
2938             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2939                 goto nla_put_failure;
2940 #ifdef CONFIG_IP_ROUTE_CLASSID
2941         if (rt->dst.tclassid &&
2942             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2943                 goto nla_put_failure;
2944 #endif
2945         if (!rt_is_input_route(rt) &&
2946             rt->rt_src != rt->rt_key_src) {
2947                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2948                         goto nla_put_failure;
2949         }
2950         if (rt->rt_dst != rt->rt_gateway &&
2951             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2952                 goto nla_put_failure;
2953
2954         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2955                 goto nla_put_failure;
2956
2957         if (rt->rt_mark &&
2958             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2959                 goto nla_put_failure;
2960
2961         error = rt->dst.error;
2962         if (rt_has_peer(rt)) {
2963                 const struct inet_peer *peer = rt_peer_ptr(rt);
2964                 inet_peer_refcheck(peer);
2965                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2966                 if (peer->tcp_ts_stamp) {
2967                         ts = peer->tcp_ts;
2968                         tsage = get_seconds() - peer->tcp_ts_stamp;
2969                 }
2970                 expires = ACCESS_ONCE(peer->pmtu_expires);
2971                 if (expires) {
2972                         if (time_before(jiffies, expires))
2973                                 expires -= jiffies;
2974                         else
2975                                 expires = 0;
2976                 }
2977         }
2978
2979         if (rt_is_input_route(rt)) {
2980 #ifdef CONFIG_IP_MROUTE
2981                 __be32 dst = rt->rt_dst;
2982
2983                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2984                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2985                         int err = ipmr_get_route(net, skb,
2986                                                  rt->rt_src, rt->rt_dst,
2987                                                  r, nowait);
2988                         if (err <= 0) {
2989                                 if (!nowait) {
2990                                         if (err == 0)
2991                                                 return 0;
2992                                         goto nla_put_failure;
2993                                 } else {
2994                                         if (err == -EMSGSIZE)
2995                                                 goto nla_put_failure;
2996                                         error = err;
2997                                 }
2998                         }
2999                 } else
3000 #endif
3001                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3002                                 goto nla_put_failure;
3003         }
3004
3005         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3006                                expires, error) < 0)
3007                 goto nla_put_failure;
3008
3009         return nlmsg_end(skb, nlh);
3010
3011 nla_put_failure:
3012         nlmsg_cancel(skb, nlh);
3013         return -EMSGSIZE;
3014 }
3015
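/*
 * Handle an RTM_GETROUTE request (this is what "ip route get" issues):
 * parse the attributes, perform either an input-route lookup when
 * RTA_IIF is given or an ordinary output lookup otherwise, and unicast
 * the result back to the requester via rt_fill_info().
 */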
3016 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3017 {
3018         struct net *net = sock_net(in_skb->sk);
3019         struct rtmsg *rtm;
3020         struct nlattr *tb[RTA_MAX+1];
3021         struct rtable *rt = NULL;
3022         __be32 dst = 0;
3023         __be32 src = 0;
3024         u32 iif;
3025         int err;
3026         int mark;
3027         struct sk_buff *skb;
3028
3029         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3030         if (err < 0)
3031                 goto errout;
3032
3033         rtm = nlmsg_data(nlh);
3034
3035         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3036         if (skb == NULL) {
3037                 err = -ENOBUFS;
3038                 goto errout;
3039         }
3040
3041         /* Reserve room for dummy headers; this skb can pass
3042          * through a good chunk of the routing engine.
3043          */
3044         skb_reset_mac_header(skb);
3045         skb_reset_network_header(skb);
3046
3047         /* Bugfix: give ip_route_input() enough of an IP header that it does not choke. */
3048         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3049         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3050
3051         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3052         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3053         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3054         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3055
3056         if (iif) {
3057                 struct net_device *dev;
3058
3059                 dev = __dev_get_by_index(net, iif);
3060                 if (dev == NULL) {
3061                         err = -ENODEV;
3062                         goto errout_free;
3063                 }
3064
3065                 skb->protocol   = htons(ETH_P_IP);
3066                 skb->dev        = dev;
3067                 skb->mark       = mark;
3068                 local_bh_disable();
3069                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3070                 local_bh_enable();
3071
3072                 rt = skb_rtable(skb);
3073                 if (err == 0 && rt->dst.error)
3074                         err = -rt->dst.error;
3075         } else {
3076                 struct flowi4 fl4 = {
3077                         .daddr = dst,
3078                         .saddr = src,
3079                         .flowi4_tos = rtm->rtm_tos,
3080                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3081                         .flowi4_mark = mark,
3082                 };
3083                 rt = ip_route_output_key(net, &fl4);
3084
3085                 err = 0;
3086                 if (IS_ERR(rt))
3087                         err = PTR_ERR(rt);
3088         }
3089
3090         if (err)
3091                 goto errout_free;
3092
3093         skb_dst_set(skb, &rt->dst);
3094         if (rtm->rtm_flags & RTM_F_NOTIFY)
3095                 rt->rt_flags |= RTCF_NOTIFY;
3096
3097         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3098                            RTM_NEWROUTE, 0, 0);
3099         if (err <= 0)
3100                 goto errout_free;
3101
3102         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3103 errout:
3104         return err;
3105
3106 errout_free:
3107         kfree_skb(skb);
3108         goto errout;
3109 }
3110
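/*
 * Netlink dump callback: walk the route cache hash table under
 * rcu_read_lock_bh() and emit one RTM_NEWROUTE message per live entry,
 * resuming from the position saved in cb->args[] on each call.
 */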
3111 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3112 {
3113         struct rtable *rt;
3114         int h, s_h;
3115         int idx, s_idx;
3116         struct net *net;
3117
3118         net = sock_net(skb->sk);
3119
3120         s_h = cb->args[0];
3121         if (s_h < 0)
3122                 s_h = 0;
3123         s_idx = idx = cb->args[1];
3124         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3125                 if (!rt_hash_table[h].chain)
3126                         continue;
3127                 rcu_read_lock_bh();
3128                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3129                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3130                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3131                                 continue;
3132                         if (rt_is_expired(rt))
3133                                 continue;
3134                         skb_dst_set_noref(skb, &rt->dst);
3135                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3136                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3137                                          1, NLM_F_MULTI) <= 0) {
3138                                 skb_dst_drop(skb);
3139                                 rcu_read_unlock_bh();
3140                                 goto done;
3141                         }
3142                         skb_dst_drop(skb);
3143                 }
3144                 rcu_read_unlock_bh();
3145         }
3146
3147 done:
3148         cb->args[0] = h;
3149         cb->args[1] = idx;
3150         return skb->len;
3151 }
3152
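/* Multicast configuration changed on @in_dev; flush the route cache. */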
3153 void ip_rt_multicast_event(struct in_device *in_dev)
3154 {
3155         rt_cache_flush(dev_net(in_dev->dev), 0);
3156 }
3157
3158 #ifdef CONFIG_SYSCTL
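/*
 * Handler for the write-only /proc/sys/net/ipv4/route/flush file.  The
 * integer written is passed to rt_cache_flush() as the flush delay for
 * the namespace stashed in ->extra1, e.g.:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */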
3159 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3160                                         void __user *buffer,
3161                                         size_t *lenp, loff_t *ppos)
3162 {
3163         if (write) {
3164                 int flush_delay;
3165                 ctl_table ctl;
3166                 struct net *net;
3167
3168                 memcpy(&ctl, __ctl, sizeof(ctl));
3169                 ctl.data = &flush_delay;
3170                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3171
3172                 net = (struct net *)__ctl->extra1;
3173                 rt_cache_flush(net, flush_delay);
3174                 return 0;
3175         }
3176
3177         return -EINVAL;
3178 }
3179
3180 static ctl_table ipv4_route_table[] = {
3181         {
3182                 .procname       = "gc_thresh",
3183                 .data           = &ipv4_dst_ops.gc_thresh,
3184                 .maxlen         = sizeof(int),
3185                 .mode           = 0644,
3186                 .proc_handler   = proc_dointvec,
3187         },
3188         {
3189                 .procname       = "max_size",
3190                 .data           = &ip_rt_max_size,
3191                 .maxlen         = sizeof(int),
3192                 .mode           = 0644,
3193                 .proc_handler   = proc_dointvec,
3194         },
3195         {
3196                 /* Deprecated. Use gc_min_interval_ms. */
3197
3198                 .procname       = "gc_min_interval",
3199                 .data           = &ip_rt_gc_min_interval,
3200                 .maxlen         = sizeof(int),
3201                 .mode           = 0644,
3202                 .proc_handler   = proc_dointvec_jiffies,
3203         },
3204         {
3205                 .procname       = "gc_min_interval_ms",
3206                 .data           = &ip_rt_gc_min_interval,
3207                 .maxlen         = sizeof(int),
3208                 .mode           = 0644,
3209                 .proc_handler   = proc_dointvec_ms_jiffies,
3210         },
3211         {
3212                 .procname       = "gc_timeout",
3213                 .data           = &ip_rt_gc_timeout,
3214                 .maxlen         = sizeof(int),
3215                 .mode           = 0644,
3216                 .proc_handler   = proc_dointvec_jiffies,
3217         },
3218         {
3219                 .procname       = "gc_interval",
3220                 .data           = &ip_rt_gc_interval,
3221                 .maxlen         = sizeof(int),
3222                 .mode           = 0644,
3223                 .proc_handler   = proc_dointvec_jiffies,
3224         },
3225         {
3226                 .procname       = "redirect_load",
3227                 .data           = &ip_rt_redirect_load,
3228                 .maxlen         = sizeof(int),
3229                 .mode           = 0644,
3230                 .proc_handler   = proc_dointvec,
3231         },
3232         {
3233                 .procname       = "redirect_number",
3234                 .data           = &ip_rt_redirect_number,
3235                 .maxlen         = sizeof(int),
3236                 .mode           = 0644,
3237                 .proc_handler   = proc_dointvec,
3238         },
3239         {
3240                 .procname       = "redirect_silence",
3241                 .data           = &ip_rt_redirect_silence,
3242                 .maxlen         = sizeof(int),
3243                 .mode           = 0644,
3244                 .proc_handler   = proc_dointvec,
3245         },
3246         {
3247                 .procname       = "error_cost",
3248                 .data           = &ip_rt_error_cost,
3249                 .maxlen         = sizeof(int),
3250                 .mode           = 0644,
3251                 .proc_handler   = proc_dointvec,
3252         },
3253         {
3254                 .procname       = "error_burst",
3255                 .data           = &ip_rt_error_burst,
3256                 .maxlen         = sizeof(int),
3257                 .mode           = 0644,
3258                 .proc_handler   = proc_dointvec,
3259         },
3260         {
3261                 .procname       = "gc_elasticity",
3262                 .data           = &ip_rt_gc_elasticity,
3263                 .maxlen         = sizeof(int),
3264                 .mode           = 0644,
3265                 .proc_handler   = proc_dointvec,
3266         },
3267         {
3268                 .procname       = "mtu_expires",
3269                 .data           = &ip_rt_mtu_expires,
3270                 .maxlen         = sizeof(int),
3271                 .mode           = 0644,
3272                 .proc_handler   = proc_dointvec_jiffies,
3273         },
3274         {
3275                 .procname       = "min_pmtu",
3276                 .data           = &ip_rt_min_pmtu,
3277                 .maxlen         = sizeof(int),
3278                 .mode           = 0644,
3279                 .proc_handler   = proc_dointvec,
3280         },
3281         {
3282                 .procname       = "min_adv_mss",
3283                 .data           = &ip_rt_min_advmss,
3284                 .maxlen         = sizeof(int),
3285                 .mode           = 0644,
3286                 .proc_handler   = proc_dointvec,
3287         },
3288         { }
3289 };
3290
3291 static struct ctl_table ipv4_route_flush_table[] = {
3292         {
3293                 .procname       = "flush",
3294                 .maxlen         = sizeof(int),
3295                 .mode           = 0200,
3296                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3297         },
3298         { },
3299 };
3300
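/*
 * Per-namespace sysctl setup: namespaces other than init_net get their
 * own copy of the flush table so that ->extra1 can point at the owning
 * struct net.
 */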
3301 static __net_init int sysctl_route_net_init(struct net *net)
3302 {
3303         struct ctl_table *tbl;
3304
3305         tbl = ipv4_route_flush_table;
3306         if (!net_eq(net, &init_net)) {
3307                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3308                 if (tbl == NULL)
3309                         goto err_dup;
3310         }
3311         tbl[0].extra1 = net;
3312
3313         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3314         if (net->ipv4.route_hdr == NULL)
3315                 goto err_reg;
3316         return 0;
3317
3318 err_reg:
3319         if (tbl != ipv4_route_flush_table)
3320                 kfree(tbl);
3321 err_dup:
3322         return -ENOMEM;
3323 }
3324
3325 static __net_exit void sysctl_route_net_exit(struct net *net)
3326 {
3327         struct ctl_table *tbl;
3328
3329         tbl = net->ipv4.route_hdr->ctl_table_arg;
3330         unregister_net_sysctl_table(net->ipv4.route_hdr);
3331         BUG_ON(tbl == ipv4_route_flush_table);
3332         kfree(tbl);
3333 }
3334
3335 static __net_initdata struct pernet_operations sysctl_route_ops = {
3336         .init = sysctl_route_net_init,
3337         .exit = sysctl_route_net_exit,
3338 };
3339 #endif
3340
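/*
 * Seed the per-namespace generation counters; rt_genid is compared
 * against cached routes (see rt_is_expired()), so bumping it invalidates
 * the whole cache for that namespace.
 */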
3341 static __net_init int rt_genid_init(struct net *net)
3342 {
3343         get_random_bytes(&net->ipv4.rt_genid,
3344                          sizeof(net->ipv4.rt_genid));
3345         get_random_bytes(&net->ipv4.dev_addr_genid,
3346                          sizeof(net->ipv4.dev_addr_genid));
3347         return 0;
3348 }
3349
3350 static __net_initdata struct pernet_operations rt_genid_ops = {
3351         .init = rt_genid_init,
3352 };
3353
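/* Allocate and initialize the per-namespace inet_peer base. */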
3354 static int __net_init ipv4_inetpeer_init(struct net *net)
3355 {
3356         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3357
3358         if (!bp)
3359                 return -ENOMEM;
3360         inet_peer_base_init(bp);
3361         net->ipv4.peers = bp;
3362         return 0;
3363 }
3364
3365 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3366 {
3367         struct inet_peer_base *bp = net->ipv4.peers;
3368
3369         net->ipv4.peers = NULL;
3370         inetpeer_invalidate_tree(bp);
3371         kfree(bp);
3372 }
3373
3374 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3375         .init   =       ipv4_inetpeer_init,
3376         .exit   =       ipv4_inetpeer_exit,
3377 };
3378
3379 #ifdef CONFIG_IP_ROUTE_CLASSID
3380 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3381 #endif /* CONFIG_IP_ROUTE_CLASSID */
3382
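/*
 * "rhash_entries=" boot parameter: overrides the automatically sized
 * number of route cache hash buckets handed to alloc_large_system_hash()
 * in ip_rt_init(), e.g. rhash_entries=65536.
 */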
3383 static __initdata unsigned long rhash_entries;
3384 static int __init set_rhash_entries(char *str)
3385 {
3386         ssize_t ret;
3387
3388         if (!str)
3389                 return 0;
3390
3391         ret = kstrtoul(str, 0, &rhash_entries);
3392         if (ret)
3393                 return 0;
3394
3395         return 1;
3396 }
3397 __setup("rhash_entries=", set_rhash_entries);
3398
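/*
 * Main IPv4 routing init: create the dst slab and the route cache hash
 * table, derive gc_thresh and ip_rt_max_size from the table size, start
 * the periodic expiry worker, and register the proc, netlink and pernet
 * hooks.
 */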
3399 int __init ip_rt_init(void)
3400 {
3401         int rc = 0;
3402
3403 #ifdef CONFIG_IP_ROUTE_CLASSID
3404         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3405         if (!ip_rt_acct)
3406                 panic("IP: failed to allocate ip_rt_acct\n");
3407 #endif
3408
3409         ipv4_dst_ops.kmem_cachep =
3410                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3411                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3412
3413         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3414
3415         if (dst_entries_init(&ipv4_dst_ops) < 0)
3416                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3417
3418         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3419                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3420
3421         rt_hash_table = (struct rt_hash_bucket *)
3422                 alloc_large_system_hash("IP route cache",
3423                                         sizeof(struct rt_hash_bucket),
3424                                         rhash_entries,
3425                                         (totalram_pages >= 128 * 1024) ?
3426                                         15 : 17,
3427                                         0,
3428                                         &rt_hash_log,
3429                                         &rt_hash_mask,
3430                                         0,
3431                                         rhash_entries ? 0 : 512 * 1024);
3432         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3433         rt_hash_lock_init();
3434
3435         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3436         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3437
3438         devinet_init();
3439         ip_fib_init();
3440
3441         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3442         expires_ljiffies = jiffies;
3443         schedule_delayed_work(&expires_work,
3444                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3445
3446         if (ip_rt_proc_init())
3447                 pr_err("Unable to create route proc files\n");
3448 #ifdef CONFIG_XFRM
3449         xfrm_init();
3450         xfrm4_init(ip_rt_max_size);
3451 #endif
3452         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3453
3454 #ifdef CONFIG_SYSCTL
3455         register_pernet_subsys(&sysctl_route_ops);
3456 #endif
3457         register_pernet_subsys(&rt_genid_ops);
3458         register_pernet_subsys(&ipv4_inetpeer_ops);
3459         return rc;
3460 }
3461
3462 #ifdef CONFIG_SYSCTL
3463 /*
3464  * We really need to sanitize the IPv4 init order; once that is done,
3465  * this workaround will go away.
3466  */
3467 void __init ip_static_sysctl_init(void)
3468 {
3469         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3470 }
3471 #endif