net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
116 #define RT_FL_TOS(oldflp4) \
117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define IP_MAX_MTU      0xFFF0
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly  = 9;
128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly       = HZ;
131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly    = 8;
133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly       = 256;
136 static int rt_chain_length_max __read_mostly    = 20;
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155                             int how)
156 {
157 }
158
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161         struct rtable *rt = (struct rtable *) dst;
162         struct inet_peer *peer;
163         u32 *p = NULL;
164
165         peer = rt_get_peer_create(rt, rt->rt_dst);
166         if (peer) {
167                 u32 *old_p = __DST_METRICS_PTR(old);
168                 unsigned long prev, new;
169
170                 p = peer->metrics;
171                 if (inet_metrics_new(peer))
172                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
173
174                 new = (unsigned long) p;
175                 prev = cmpxchg(&dst->_metrics, old, new);
176
177                 if (prev != old) {
178                         p = __DST_METRICS_PTR(prev);
179                         if (prev & DST_METRICS_READ_ONLY)
180                                 p = NULL;
181                 } else {
182                         if (rt->fi) {
183                                 fib_info_put(rt->fi);
184                                 rt->fi = NULL;
185                         }
186                 }
187         }
188         return p;
189 }
190
191 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
192
193 static struct dst_ops ipv4_dst_ops = {
194         .family =               AF_INET,
195         .protocol =             cpu_to_be16(ETH_P_IP),
196         .gc =                   rt_garbage_collect,
197         .check =                ipv4_dst_check,
198         .default_advmss =       ipv4_default_advmss,
199         .mtu =                  ipv4_mtu,
200         .cow_metrics =          ipv4_cow_metrics,
201         .destroy =              ipv4_dst_destroy,
202         .ifdown =               ipv4_dst_ifdown,
203         .negative_advice =      ipv4_negative_advice,
204         .link_failure =         ipv4_link_failure,
205         .update_pmtu =          ip_rt_update_pmtu,
206         .local_out =            __ip_local_out,
207         .neigh_lookup =         ipv4_neigh_lookup,
208 };
209
210 #define ECN_OR_COST(class)      TC_PRIO_##class
211
212 const __u8 ip_tos2prio[16] = {
213         TC_PRIO_BESTEFFORT,
214         ECN_OR_COST(BESTEFFORT),
215         TC_PRIO_BESTEFFORT,
216         ECN_OR_COST(BESTEFFORT),
217         TC_PRIO_BULK,
218         ECN_OR_COST(BULK),
219         TC_PRIO_BULK,
220         ECN_OR_COST(BULK),
221         TC_PRIO_INTERACTIVE,
222         ECN_OR_COST(INTERACTIVE),
223         TC_PRIO_INTERACTIVE,
224         ECN_OR_COST(INTERACTIVE),
225         TC_PRIO_INTERACTIVE_BULK,
226         ECN_OR_COST(INTERACTIVE_BULK),
227         TC_PRIO_INTERACTIVE_BULK,
228         ECN_OR_COST(INTERACTIVE_BULK)
229 };
230 EXPORT_SYMBOL(ip_tos2prio);
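/*
 * Illustrative note (not part of the original file): this table is indexed
 * by the four TOS bits of the IP header.  The helper in include/net/route.h
 * does roughly the following (a sketch, from memory of this kernel era):
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * so, for example, IPTOS_LOWDELAY (0x10) maps to index 8, i.e.
 * TC_PRIO_INTERACTIVE.
 */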
231
232 /*
233  * Route cache.
234  */
235
236 /* The locking scheme is rather straightforward:
237  *
238  * 1) Read-Copy Update protects the buckets of the central route hash.
239  * 2) Only writers remove entries, and they hold the lock
240  *    as they look at rtable reference counts.
241  * 3) Only readers acquire references to rtable entries,
242  *    they do so with atomic increments and with the
243  *    lock held.
244  */
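/*
 * A minimal sketch of the two access patterns described above (illustrative
 * only; the real lookup and flush helpers appear further down in this file):
 *
 *	// reader side: lockless walk of one bucket under RCU
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (compare_keys(rth, rt))
 *			break;
 *	}
 *	rcu_read_unlock_bh();
 *
 *	// writer side: unlink under the per-bucket lock, free after a
 *	// grace period
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->dst.rt_next;
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *	rt_free(rth);		// call_rcu_bh() based, see below
 */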
245
246 struct rt_hash_bucket {
247         struct rtable __rcu     *chain;
248 };
249
250 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
251         defined(CONFIG_PROVE_LOCKING)
252 /*
253  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
254  * The size of this table is a power of two and depends on the number of CPUs.
255  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
256  */
257 #ifdef CONFIG_LOCKDEP
258 # define RT_HASH_LOCK_SZ        256
259 #else
260 # if NR_CPUS >= 32
261 #  define RT_HASH_LOCK_SZ       4096
262 # elif NR_CPUS >= 16
263 #  define RT_HASH_LOCK_SZ       2048
264 # elif NR_CPUS >= 8
265 #  define RT_HASH_LOCK_SZ       1024
266 # elif NR_CPUS >= 4
267 #  define RT_HASH_LOCK_SZ       512
268 # else
269 #  define RT_HASH_LOCK_SZ       256
270 # endif
271 #endif
272
273 static spinlock_t       *rt_hash_locks;
274 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
275
276 static __init void rt_hash_lock_init(void)
277 {
278         int i;
279
280         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
281                         GFP_KERNEL);
282         if (!rt_hash_locks)
283                 panic("IP: failed to allocate rt_hash_locks\n");
284
285         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
286                 spin_lock_init(&rt_hash_locks[i]);
287 }
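/*
 * Illustrative example (not from the original source): because
 * RT_HASH_LOCK_SZ is a power of two, rt_hash_lock_addr() just masks the
 * bucket index, so buckets whose indices differ only in the upper bits
 * share a lock.  With RT_HASH_LOCK_SZ == 256:
 *
 *	rt_hash_lock_addr(5)   == &rt_hash_locks[5];
 *	rt_hash_lock_addr(261) == &rt_hash_locks[5];	// 261 & 255 == 5
 *
 * This keeps the lock table small while still spreading writer contention
 * across many locks.
 */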
288 #else
289 # define rt_hash_lock_addr(slot) NULL
290
291 static inline void rt_hash_lock_init(void)
292 {
293 }
294 #endif
295
296 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
297 static unsigned int             rt_hash_mask __read_mostly;
298 static unsigned int             rt_hash_log  __read_mostly;
299
300 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
301 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
302
303 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
304                                    int genid)
305 {
306         return jhash_3words((__force u32)daddr, (__force u32)saddr,
307                             idx, genid)
308                 & rt_hash_mask;
309 }
310
311 static inline int rt_genid(struct net *net)
312 {
313         return atomic_read(&net->ipv4.rt_genid);
314 }
315
316 #ifdef CONFIG_PROC_FS
317 struct rt_cache_iter_state {
318         struct seq_net_private p;
319         int bucket;
320         int genid;
321 };
322
323 static struct rtable *rt_cache_get_first(struct seq_file *seq)
324 {
325         struct rt_cache_iter_state *st = seq->private;
326         struct rtable *r = NULL;
327
328         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
329                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
330                         continue;
331                 rcu_read_lock_bh();
332                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
333                 while (r) {
334                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
335                             r->rt_genid == st->genid)
336                                 return r;
337                         r = rcu_dereference_bh(r->dst.rt_next);
338                 }
339                 rcu_read_unlock_bh();
340         }
341         return r;
342 }
343
344 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
345                                           struct rtable *r)
346 {
347         struct rt_cache_iter_state *st = seq->private;
348
349         r = rcu_dereference_bh(r->dst.rt_next);
350         while (!r) {
351                 rcu_read_unlock_bh();
352                 do {
353                         if (--st->bucket < 0)
354                                 return NULL;
355                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
356                 rcu_read_lock_bh();
357                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
358         }
359         return r;
360 }
361
362 static struct rtable *rt_cache_get_next(struct seq_file *seq,
363                                         struct rtable *r)
364 {
365         struct rt_cache_iter_state *st = seq->private;
366         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
367                 if (dev_net(r->dst.dev) != seq_file_net(seq))
368                         continue;
369                 if (r->rt_genid == st->genid)
370                         break;
371         }
372         return r;
373 }
374
375 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
376 {
377         struct rtable *r = rt_cache_get_first(seq);
378
379         if (r)
380                 while (pos && (r = rt_cache_get_next(seq, r)))
381                         --pos;
382         return pos ? NULL : r;
383 }
384
385 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
386 {
387         struct rt_cache_iter_state *st = seq->private;
388         if (*pos)
389                 return rt_cache_get_idx(seq, *pos - 1);
390         st->genid = rt_genid(seq_file_net(seq));
391         return SEQ_START_TOKEN;
392 }
393
394 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
395 {
396         struct rtable *r;
397
398         if (v == SEQ_START_TOKEN)
399                 r = rt_cache_get_first(seq);
400         else
401                 r = rt_cache_get_next(seq, v);
402         ++*pos;
403         return r;
404 }
405
406 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
407 {
408         if (v && v != SEQ_START_TOKEN)
409                 rcu_read_unlock_bh();
410 }
411
412 static int rt_cache_seq_show(struct seq_file *seq, void *v)
413 {
414         if (v == SEQ_START_TOKEN)
415                 seq_printf(seq, "%-127s\n",
416                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
417                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
418                            "HHUptod\tSpecDst");
419         else {
420                 struct rtable *r = v;
421                 struct neighbour *n;
422                 int len, HHUptod;
423
424                 rcu_read_lock();
425                 n = dst_get_neighbour_noref(&r->dst);
426                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
427                 rcu_read_unlock();
428
429                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
430                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
431                         r->dst.dev ? r->dst.dev->name : "*",
432                         (__force u32)r->rt_dst,
433                         (__force u32)r->rt_gateway,
434                         r->rt_flags, atomic_read(&r->dst.__refcnt),
435                         r->dst.__use, 0, (__force u32)r->rt_src,
436                         dst_metric_advmss(&r->dst) + 40,
437                         dst_metric(&r->dst, RTAX_WINDOW),
438                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
439                               dst_metric(&r->dst, RTAX_RTTVAR)),
440                         r->rt_key_tos,
441                         -1,
442                         HHUptod,
443                         r->rt_spec_dst, &len);
444
445                 seq_printf(seq, "%*s\n", 127 - len, "");
446         }
447         return 0;
448 }
449
450 static const struct seq_operations rt_cache_seq_ops = {
451         .start  = rt_cache_seq_start,
452         .next   = rt_cache_seq_next,
453         .stop   = rt_cache_seq_stop,
454         .show   = rt_cache_seq_show,
455 };
456
457 static int rt_cache_seq_open(struct inode *inode, struct file *file)
458 {
459         return seq_open_net(inode, file, &rt_cache_seq_ops,
460                         sizeof(struct rt_cache_iter_state));
461 }
462
463 static const struct file_operations rt_cache_seq_fops = {
464         .owner   = THIS_MODULE,
465         .open    = rt_cache_seq_open,
466         .read    = seq_read,
467         .llseek  = seq_lseek,
468         .release = seq_release_net,
469 };
470
471
472 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
473 {
474         int cpu;
475
476         if (*pos == 0)
477                 return SEQ_START_TOKEN;
478
479         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
480                 if (!cpu_possible(cpu))
481                         continue;
482                 *pos = cpu+1;
483                 return &per_cpu(rt_cache_stat, cpu);
484         }
485         return NULL;
486 }
487
488 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
489 {
490         int cpu;
491
492         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
493                 if (!cpu_possible(cpu))
494                         continue;
495                 *pos = cpu+1;
496                 return &per_cpu(rt_cache_stat, cpu);
497         }
498         return NULL;
499
500 }
501
502 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
503 {
504
505 }
506
507 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
508 {
509         struct rt_cache_stat *st = v;
510
511         if (v == SEQ_START_TOKEN) {
512                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
513                 return 0;
514         }
515
516         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
517                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
518                    dst_entries_get_slow(&ipv4_dst_ops),
519                    st->in_hit,
520                    st->in_slow_tot,
521                    st->in_slow_mc,
522                    st->in_no_route,
523                    st->in_brd,
524                    st->in_martian_dst,
525                    st->in_martian_src,
526
527                    st->out_hit,
528                    st->out_slow_tot,
529                    st->out_slow_mc,
530
531                    st->gc_total,
532                    st->gc_ignored,
533                    st->gc_goal_miss,
534                    st->gc_dst_overflow,
535                    st->in_hlist_search,
536                    st->out_hlist_search
537                 );
538         return 0;
539 }
540
541 static const struct seq_operations rt_cpu_seq_ops = {
542         .start  = rt_cpu_seq_start,
543         .next   = rt_cpu_seq_next,
544         .stop   = rt_cpu_seq_stop,
545         .show   = rt_cpu_seq_show,
546 };
547
548
549 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
550 {
551         return seq_open(file, &rt_cpu_seq_ops);
552 }
553
554 static const struct file_operations rt_cpu_seq_fops = {
555         .owner   = THIS_MODULE,
556         .open    = rt_cpu_seq_open,
557         .read    = seq_read,
558         .llseek  = seq_lseek,
559         .release = seq_release,
560 };
561
562 #ifdef CONFIG_IP_ROUTE_CLASSID
563 static int rt_acct_proc_show(struct seq_file *m, void *v)
564 {
565         struct ip_rt_acct *dst, *src;
566         unsigned int i, j;
567
568         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
569         if (!dst)
570                 return -ENOMEM;
571
572         for_each_possible_cpu(i) {
573                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
574                 for (j = 0; j < 256; j++) {
575                         dst[j].o_bytes   += src[j].o_bytes;
576                         dst[j].o_packets += src[j].o_packets;
577                         dst[j].i_bytes   += src[j].i_bytes;
578                         dst[j].i_packets += src[j].i_packets;
579                 }
580         }
581
582         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
583         kfree(dst);
584         return 0;
585 }
586
587 static int rt_acct_proc_open(struct inode *inode, struct file *file)
588 {
589         return single_open(file, rt_acct_proc_show, NULL);
590 }
591
592 static const struct file_operations rt_acct_proc_fops = {
593         .owner          = THIS_MODULE,
594         .open           = rt_acct_proc_open,
595         .read           = seq_read,
596         .llseek         = seq_lseek,
597         .release        = single_release,
598 };
599 #endif
600
601 static int __net_init ip_rt_do_proc_init(struct net *net)
602 {
603         struct proc_dir_entry *pde;
604
605         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
606                         &rt_cache_seq_fops);
607         if (!pde)
608                 goto err1;
609
610         pde = proc_create("rt_cache", S_IRUGO,
611                           net->proc_net_stat, &rt_cpu_seq_fops);
612         if (!pde)
613                 goto err2;
614
615 #ifdef CONFIG_IP_ROUTE_CLASSID
616         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
617         if (!pde)
618                 goto err3;
619 #endif
620         return 0;
621
622 #ifdef CONFIG_IP_ROUTE_CLASSID
623 err3:
624         remove_proc_entry("rt_cache", net->proc_net_stat);
625 #endif
626 err2:
627         remove_proc_entry("rt_cache", net->proc_net);
628 err1:
629         return -ENOMEM;
630 }
631
632 static void __net_exit ip_rt_do_proc_exit(struct net *net)
633 {
634         remove_proc_entry("rt_cache", net->proc_net_stat);
635         remove_proc_entry("rt_cache", net->proc_net);
636 #ifdef CONFIG_IP_ROUTE_CLASSID
637         remove_proc_entry("rt_acct", net->proc_net);
638 #endif
639 }
640
641 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
642         .init = ip_rt_do_proc_init,
643         .exit = ip_rt_do_proc_exit,
644 };
645
646 static int __init ip_rt_proc_init(void)
647 {
648         return register_pernet_subsys(&ip_rt_proc_ops);
649 }
650
651 #else
652 static inline int ip_rt_proc_init(void)
653 {
654         return 0;
655 }
656 #endif /* CONFIG_PROC_FS */
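/*
 * The proc code above exposes read-only files under /proc: /proc/net/rt_cache
 * (the cache dump), /proc/net/stat/rt_cache (per-cpu statistics) and, with
 * CONFIG_IP_ROUTE_CLASSID, /proc/net/rt_acct.  A minimal userspace sketch for
 * dumping the statistics file (illustrative only, not part of the kernel):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[512];
 *		FILE *f = fopen("/proc/net/stat/rt_cache", "r");
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 */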
657
658 static inline void rt_free(struct rtable *rt)
659 {
660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
661 }
662
663 static inline void rt_drop(struct rtable *rt)
664 {
665         ip_rt_put(rt);
666         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
667 }
668
669 static inline int rt_fast_clean(struct rtable *rth)
670 {
671         /* Kill broadcast/multicast entries very aggressively, if they
672            collide in the hash table with more useful entries */
673         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
674                 rt_is_input_route(rth) && rth->dst.rt_next;
675 }
676
677 static inline int rt_valuable(struct rtable *rth)
678 {
679         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
680                 (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
681 }
682
683 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
684 {
685         unsigned long age;
686         int ret = 0;
687
688         if (atomic_read(&rth->dst.__refcnt))
689                 goto out;
690
691         age = jiffies - rth->dst.lastuse;
692         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
693             (age <= tmo2 && rt_valuable(rth)))
694                 goto out;
695         ret = 1;
696 out:    return ret;
697 }
698
699 /* Bits of score are:
700  * 31: very valuable
701  * 30: not quite useless
702  * 29..0: usage counter
703  */
704 static inline u32 rt_score(struct rtable *rt)
705 {
706         u32 score = jiffies - rt->dst.lastuse;
707
708         score = ~score & ~(3<<30);
709
710         if (rt_valuable(rt))
711                 score |= (1<<31);
712
713         if (rt_is_output_route(rt) ||
714             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
715                 score |= (1<<30);
716
717         return score;
718 }
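/*
 * Worked example (illustrative, assuming HZ == 100): an unreferenced input
 * route to a broadcast address last used 2 seconds ago scores
 * ~200 & ~(3 << 30), with neither high bit set; an output route of the same
 * age additionally gets bit 30, and a redirected/notify route gets bit 31.
 * rt_intern_hash() below evicts the unreferenced entry with the *lowest*
 * score, so older entries and entries without the "valuable" / "not quite
 * useless" bits are reclaimed first.
 */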
719
720 static inline bool rt_caching(const struct net *net)
721 {
722         return net->ipv4.current_rt_cache_rebuild_count <=
723                 net->ipv4.sysctl_rt_cache_rebuild_count;
724 }
725
726 static inline bool compare_hash_inputs(const struct rtable *rt1,
727                                        const struct rtable *rt2)
728 {
729         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
731                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
732 }
733
734 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
735 {
736         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
737                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
738                 (rt1->rt_mark ^ rt2->rt_mark) |
739                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
740                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
741                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
742 }
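/*
 * Illustrative note (not from the original source): both comparison helpers
 * above use the branchless "XOR then OR" idiom.  (a ^ b) is zero iff a == b,
 * so OR-ing all the XORed field pairs is zero only when every field matches.
 * The equivalent branchy form for a single field pair would be:
 *
 *	if (rt1->rt_key_dst != rt2->rt_key_dst)
 *		return 0;
 *
 * The single-expression form lets the compiler evaluate all fields without
 * early-exit branches.
 */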
743
744 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
745 {
746         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
747 }
748
749 static inline int rt_is_expired(struct rtable *rth)
750 {
751         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
752 }
753
754 /*
755  * Perform a full scan of the hash table and free all entries.
756  * Can be called by a softirq or a process.
757  * In the latter case, we want to reschedule if necessary.
758  */
759 static void rt_do_flush(struct net *net, int process_context)
760 {
761         unsigned int i;
762         struct rtable *rth, *next;
763
764         for (i = 0; i <= rt_hash_mask; i++) {
765                 struct rtable __rcu **pprev;
766                 struct rtable *list;
767
768                 if (process_context && need_resched())
769                         cond_resched();
770                 rth = rcu_access_pointer(rt_hash_table[i].chain);
771                 if (!rth)
772                         continue;
773
774                 spin_lock_bh(rt_hash_lock_addr(i));
775
776                 list = NULL;
777                 pprev = &rt_hash_table[i].chain;
778                 rth = rcu_dereference_protected(*pprev,
779                         lockdep_is_held(rt_hash_lock_addr(i)));
780
781                 while (rth) {
782                         next = rcu_dereference_protected(rth->dst.rt_next,
783                                 lockdep_is_held(rt_hash_lock_addr(i)));
784
785                         if (!net ||
786                             net_eq(dev_net(rth->dst.dev), net)) {
787                                 rcu_assign_pointer(*pprev, next);
788                                 rcu_assign_pointer(rth->dst.rt_next, list);
789                                 list = rth;
790                         } else {
791                                 pprev = &rth->dst.rt_next;
792                         }
793                         rth = next;
794                 }
795
796                 spin_unlock_bh(rt_hash_lock_addr(i));
797
798                 for (; list; list = next) {
799                         next = rcu_dereference_protected(list->dst.rt_next, 1);
800                         rt_free(list);
801                 }
802         }
803 }
804
805 /*
806  * While freeing expired entries, we compute average chain length
807  * and standard deviation, using fixed-point arithmetic.
808  * This gives an estimate of rt_chain_length_max:
809  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
810  * We use 3 bits for the fractional part, and 29 (or 61) bits for the magnitude.
811  */
812
813 #define FRACT_BITS 3
814 #define ONE (1UL << FRACT_BITS)
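/*
 * Worked example of the fixed-point bookkeeping (illustrative only): with
 * FRACT_BITS == 3, every entry counted by has_noalias() below contributes
 * ONE == 8 to a chain's "length".  If rt_check_expire() samples four chains
 * of 1, 2, 2 and 3 entries:
 *
 *	sum  = 8 + 16 + 16 + 24         = 64
 *	avg  = 64 / 4                   = 16	(i.e. 2.0 entries)
 *	sum2 = 64 + 256 + 256 + 576     = 1152
 *	sd   = int_sqrt(1152/4 - 16*16) = int_sqrt(32) = 5
 *
 * and rt_chain_length_max becomes
 * max(ip_rt_gc_elasticity, (16 + 4*5) >> FRACT_BITS) = max(8, 4) = 8.
 */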
815
816 /*
817  * Given a hash chain and an item in this hash chain,
818  * find whether a previous entry has the same hash_inputs
819  * (but differs on tos, mark or oif).
820  * Returns 0 if an alias is found.
821  * Returns ONE if rth has no alias before itself.
822  */
823 static int has_noalias(const struct rtable *head, const struct rtable *rth)
824 {
825         const struct rtable *aux = head;
826
827         while (aux != rth) {
828                 if (compare_hash_inputs(aux, rth))
829                         return 0;
830                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
831         }
832         return ONE;
833 }
834
835 static void rt_check_expire(void)
836 {
837         static unsigned int rover;
838         unsigned int i = rover, goal;
839         struct rtable *rth;
840         struct rtable __rcu **rthp;
841         unsigned long samples = 0;
842         unsigned long sum = 0, sum2 = 0;
843         unsigned long delta;
844         u64 mult;
845
846         delta = jiffies - expires_ljiffies;
847         expires_ljiffies = jiffies;
848         mult = ((u64)delta) << rt_hash_log;
849         if (ip_rt_gc_timeout > 1)
850                 do_div(mult, ip_rt_gc_timeout);
851         goal = (unsigned int)mult;
852         if (goal > rt_hash_mask)
853                 goal = rt_hash_mask + 1;
854         for (; goal > 0; goal--) {
855                 unsigned long tmo = ip_rt_gc_timeout;
856                 unsigned long length;
857
858                 i = (i + 1) & rt_hash_mask;
859                 rthp = &rt_hash_table[i].chain;
860
861                 if (need_resched())
862                         cond_resched();
863
864                 samples++;
865
866                 if (rcu_dereference_raw(*rthp) == NULL)
867                         continue;
868                 length = 0;
869                 spin_lock_bh(rt_hash_lock_addr(i));
870                 while ((rth = rcu_dereference_protected(*rthp,
871                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
872                         prefetch(rth->dst.rt_next);
873                         if (rt_is_expired(rth)) {
874                                 *rthp = rth->dst.rt_next;
875                                 rt_free(rth);
876                                 continue;
877                         }
878                         if (rth->dst.expires) {
879                                 /* Entry is expired even if it is in use */
880                                 if (time_before_eq(jiffies, rth->dst.expires)) {
881 nofree:
882                                         tmo >>= 1;
883                                         rthp = &rth->dst.rt_next;
884                                         /*
885                                          * We only count entries on
886                                          * a chain with equal hash inputs once
887                                          * so that entries for different QOS
888                                          * levels, and other non-hash input
889                                          * attributes don't unfairly skew
890                                          * the length computation
891                                          */
892                                         length += has_noalias(rt_hash_table[i].chain, rth);
893                                         continue;
894                                 }
895                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
896                                 goto nofree;
897
898                         /* Cleanup aged off entries. */
899                         *rthp = rth->dst.rt_next;
900                         rt_free(rth);
901                 }
902                 spin_unlock_bh(rt_hash_lock_addr(i));
903                 sum += length;
904                 sum2 += length*length;
905         }
906         if (samples) {
907                 unsigned long avg = sum / samples;
908                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
909                 rt_chain_length_max = max_t(unsigned long,
910                                         ip_rt_gc_elasticity,
911                                         (avg + 4*sd) >> FRACT_BITS);
912         }
913         rover = i;
914 }
915
916 /*
917  * rt_worker_func() is run in process context.
918  * we call rt_check_expire() to scan part of the hash table
919  */
920 static void rt_worker_func(struct work_struct *work)
921 {
922         rt_check_expire();
923         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
924 }
925
926 /*
927  * Perturbation of rt_genid by a small quantity [1..256]
928  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
929  * many times (2^24) without reusing a recent rt_genid.
930  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
931  */
932 static void rt_cache_invalidate(struct net *net)
933 {
934         unsigned char shuffle;
935
936         get_random_bytes(&shuffle, sizeof(shuffle));
937         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
938         inetpeer_invalidate_family(AF_INET);
939 }
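/*
 * Illustrative arithmetic for the comment above (not from the original
 * source): each invalidation advances rt_genid by shuffle + 1, a value in
 * [1, 256].  Since the counter is a 32-bit atomic, at least
 * 2^32 / 256 = 2^24 invalidations are needed before it can wrap back to a
 * recently used value, so entries stamped with an old rt_genid are reliably
 * reported as stale by rt_is_expired().
 */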
940
941 /*
942  * delay < 0  : invalidate cache (fast : entries will be deleted later)
943  * delay >= 0 : invalidate & flush cache (can be long)
944  */
945 void rt_cache_flush(struct net *net, int delay)
946 {
947         rt_cache_invalidate(net);
948         if (delay >= 0)
949                 rt_do_flush(net, !in_softirq());
950 }
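/*
 * Typical usage (illustrative; the real call sites live elsewhere, e.g. in
 * the FIB code): rt_cache_flush(net, -1) only bumps the generation id and
 * lets stale entries be reaped lazily, while rt_cache_flush(net, 0) also
 * walks the whole hash table synchronously via rt_do_flush().
 */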
951
952 /* Flush previous cache invalidated entries from the cache */
953 void rt_cache_flush_batch(struct net *net)
954 {
955         rt_do_flush(net, !in_softirq());
956 }
957
958 static void rt_emergency_hash_rebuild(struct net *net)
959 {
960         net_warn_ratelimited("Route hash chain too long!\n");
961         rt_cache_invalidate(net);
962 }
963
964 /*
965    Short description of GC goals.
966
967    We want to build an algorithm which keeps the routing cache
968    at some equilibrium point, where the number of aged-off entries
969    stays approximately equal to the number of newly generated ones.
970
971    The current expiration strength is the variable "expire".
972    We try to adjust it dynamically, so that when the network is idle,
973    "expire" is large enough to keep plenty of warm entries, and when
974    load increases it is reduced to limit the cache size.
975  */
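/*
 * Rough numeric sketch of the goal computation below (illustrative only):
 * with a hash table of 2^rt_hash_log == 65536 buckets and the default
 * ip_rt_gc_elasticity of 8, the collector starts trimming once the cache
 * holds more than 8 << 16 == 524288 entries; "goal" is the excess over that
 * threshold (or, when below it, the excess over the current "equilibrium"
 * estimate).
 */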
976
977 static int rt_garbage_collect(struct dst_ops *ops)
978 {
979         static unsigned long expire = RT_GC_TIMEOUT;
980         static unsigned long last_gc;
981         static int rover;
982         static int equilibrium;
983         struct rtable *rth;
984         struct rtable __rcu **rthp;
985         unsigned long now = jiffies;
986         int goal;
987         int entries = dst_entries_get_fast(&ipv4_dst_ops);
988
989         /*
990          * Garbage collection is pretty expensive,
991          * so do not run it too frequently.
992          */
993
994         RT_CACHE_STAT_INC(gc_total);
995
996         if (now - last_gc < ip_rt_gc_min_interval &&
997             entries < ip_rt_max_size) {
998                 RT_CACHE_STAT_INC(gc_ignored);
999                 goto out;
1000         }
1001
1002         entries = dst_entries_get_slow(&ipv4_dst_ops);
1003         /* Calculate number of entries, which we want to expire now. */
1004         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1005         if (goal <= 0) {
1006                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1007                         equilibrium = ipv4_dst_ops.gc_thresh;
1008                 goal = entries - equilibrium;
1009                 if (goal > 0) {
1010                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1011                         goal = entries - equilibrium;
1012                 }
1013         } else {
1014                 /* We are in a dangerous area. Try to reduce the cache really
1015                  * aggressively.
1016                  */
1017                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1018                 equilibrium = entries - goal;
1019         }
1020
1021         if (now - last_gc >= ip_rt_gc_min_interval)
1022                 last_gc = now;
1023
1024         if (goal <= 0) {
1025                 equilibrium += goal;
1026                 goto work_done;
1027         }
1028
1029         do {
1030                 int i, k;
1031
1032                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1033                         unsigned long tmo = expire;
1034
1035                         k = (k + 1) & rt_hash_mask;
1036                         rthp = &rt_hash_table[k].chain;
1037                         spin_lock_bh(rt_hash_lock_addr(k));
1038                         while ((rth = rcu_dereference_protected(*rthp,
1039                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1040                                 if (!rt_is_expired(rth) &&
1041                                         !rt_may_expire(rth, tmo, expire)) {
1042                                         tmo >>= 1;
1043                                         rthp = &rth->dst.rt_next;
1044                                         continue;
1045                                 }
1046                                 *rthp = rth->dst.rt_next;
1047                                 rt_free(rth);
1048                                 goal--;
1049                         }
1050                         spin_unlock_bh(rt_hash_lock_addr(k));
1051                         if (goal <= 0)
1052                                 break;
1053                 }
1054                 rover = k;
1055
1056                 if (goal <= 0)
1057                         goto work_done;
1058
1059                 /* Goal is not achieved. We stop the process if:
1060
1061                    - expire is reduced to zero. Otherwise, expire is halved.
1062                    - the table is not full.
1063                    - we are called from interrupt.
1064                    - the jiffies check is just a fallback/debug loop breaker.
1065                      We will not spin here for a long time in any case.
1066                  */
1067
1068                 RT_CACHE_STAT_INC(gc_goal_miss);
1069
1070                 if (expire == 0)
1071                         break;
1072
1073                 expire >>= 1;
1074
1075                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1076                         goto out;
1077         } while (!in_softirq() && time_before_eq(jiffies, now));
1078
1079         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080                 goto out;
1081         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1082                 goto out;
1083         net_warn_ratelimited("dst cache overflow\n");
1084         RT_CACHE_STAT_INC(gc_dst_overflow);
1085         return 1;
1086
1087 work_done:
1088         expire += ip_rt_gc_min_interval;
1089         if (expire > ip_rt_gc_timeout ||
1090             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1091             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1092                 expire = ip_rt_gc_timeout;
1093 out:    return 0;
1094 }
1095
1096 /*
1097  * Returns number of entries in a hash chain that have different hash_inputs
1098  */
1099 static int slow_chain_length(const struct rtable *head)
1100 {
1101         int length = 0;
1102         const struct rtable *rth = head;
1103
1104         while (rth) {
1105                 length += has_noalias(head, rth);
1106                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1107         }
1108         return length >> FRACT_BITS;
1109 }
1110
1111 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1112 {
1113         static const __be32 inaddr_any = 0;
1114         struct net_device *dev = dst->dev;
1115         const __be32 *pkey = daddr;
1116         const struct rtable *rt;
1117         struct neighbour *n;
1118
1119         rt = (const struct rtable *) dst;
1120
1121         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122                 pkey = &inaddr_any;
1123         else if (rt->rt_gateway)
1124                 pkey = (const __be32 *) &rt->rt_gateway;
1125
1126         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1127         if (n)
1128                 return n;
1129         return neigh_create(&arp_tbl, pkey, dev);
1130 }
1131
1132 static int rt_bind_neighbour(struct rtable *rt)
1133 {
1134         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1135         if (IS_ERR(n))
1136                 return PTR_ERR(n);
1137         dst_set_neighbour(&rt->dst, n);
1138
1139         return 0;
1140 }
1141
1142 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1143                                      struct sk_buff *skb, int ifindex)
1144 {
1145         struct rtable   *rth, *cand;
1146         struct rtable __rcu **rthp, **candp;
1147         unsigned long   now;
1148         u32             min_score;
1149         int             chain_length;
1150         int attempts = !in_softirq();
1151
1152 restart:
1153         chain_length = 0;
1154         min_score = ~(u32)0;
1155         cand = NULL;
1156         candp = NULL;
1157         now = jiffies;
1158
1159         if (!rt_caching(dev_net(rt->dst.dev))) {
1160                 /*
1161                  * If we're not caching, just tell the caller we
1162                  * were successful and don't touch the route.  The
1163                  * caller holds the sole reference to the cache entry, and
1164                  * it will be released when the caller is done with it.
1165                  * If we drop it here, the callers have no way to resolve routes
1166                  * when we're not caching.  Instead, just point *rp at rt, so
1167                  * the caller gets a single use out of the route.
1168                  * Note that we do rt_free on this new route entry, so that
1169                  * once its refcount hits zero, we are still able to reap it
1170                  * (Thanks Alexey)
1171                  * Note: To avoid expensive rcu stuff for this uncached dst,
1172                  * we set DST_NOCACHE so that dst_release() can free dst without
1173                  * waiting a grace period.
1174                  */
1175
1176                 rt->dst.flags |= DST_NOCACHE;
1177                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                         int err = rt_bind_neighbour(rt);
1179                         if (err) {
1180                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1181                                 ip_rt_put(rt);
1182                                 return ERR_PTR(err);
1183                         }
1184                 }
1185
1186                 goto skip_hashing;
1187         }
1188
1189         rthp = &rt_hash_table[hash].chain;
1190
1191         spin_lock_bh(rt_hash_lock_addr(hash));
1192         while ((rth = rcu_dereference_protected(*rthp,
1193                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1194                 if (rt_is_expired(rth)) {
1195                         *rthp = rth->dst.rt_next;
1196                         rt_free(rth);
1197                         continue;
1198                 }
1199                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1200                         /* Put it first */
1201                         *rthp = rth->dst.rt_next;
1202                         /*
1203                          * Since lookup is lockfree, the deletion
1204                          * must be visible to another weakly ordered CPU before
1205                          * the insertion at the start of the hash chain.
1206                          */
1207                         rcu_assign_pointer(rth->dst.rt_next,
1208                                            rt_hash_table[hash].chain);
1209                         /*
1210                          * Since lookup is lockfree, the update writes
1211                          * must be ordered for consistency on SMP.
1212                          */
1213                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1214
1215                         dst_use(&rth->dst, now);
1216                         spin_unlock_bh(rt_hash_lock_addr(hash));
1217
1218                         rt_drop(rt);
1219                         if (skb)
1220                                 skb_dst_set(skb, &rth->dst);
1221                         return rth;
1222                 }
1223
1224                 if (!atomic_read(&rth->dst.__refcnt)) {
1225                         u32 score = rt_score(rth);
1226
1227                         if (score <= min_score) {
1228                                 cand = rth;
1229                                 candp = rthp;
1230                                 min_score = score;
1231                         }
1232                 }
1233
1234                 chain_length++;
1235
1236                 rthp = &rth->dst.rt_next;
1237         }
1238
1239         if (cand) {
1240                 /* ip_rt_gc_elasticity used to be the average chain length;
1241                  * when it is exceeded, gc becomes really aggressive.
1242                  *
1243                  * The second limit is less certain. At the moment it allows
1244                  * only 2 entries per bucket. We will see.
1245                  */
1246                 if (chain_length > ip_rt_gc_elasticity) {
1247                         *candp = cand->dst.rt_next;
1248                         rt_free(cand);
1249                 }
1250         } else {
1251                 if (chain_length > rt_chain_length_max &&
1252                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1253                         struct net *net = dev_net(rt->dst.dev);
1254                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1255                         if (!rt_caching(net)) {
1256                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1257                                         rt->dst.dev->name, num);
1258                         }
1259                         rt_emergency_hash_rebuild(net);
1260                         spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1263                                         ifindex, rt_genid(net));
1264                         goto restart;
1265                 }
1266         }
1267
1268         /* Try to bind the route to ARP only if it is an output
1269            route or on the unicast forwarding path.
1270          */
1271         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1272                 int err = rt_bind_neighbour(rt);
1273                 if (err) {
1274                         spin_unlock_bh(rt_hash_lock_addr(hash));
1275
1276                         if (err != -ENOBUFS) {
1277                                 rt_drop(rt);
1278                                 return ERR_PTR(err);
1279                         }
1280
1281                         /* Neighbour tables are full and nothing
1282                            can be released. Try to shrink the route cache;
1283                            it most likely holds some neighbour records.
1284                          */
1285                         if (attempts-- > 0) {
1286                                 int saved_elasticity = ip_rt_gc_elasticity;
1287                                 int saved_int = ip_rt_gc_min_interval;
1288                                 ip_rt_gc_elasticity     = 1;
1289                                 ip_rt_gc_min_interval   = 0;
1290                                 rt_garbage_collect(&ipv4_dst_ops);
1291                                 ip_rt_gc_min_interval   = saved_int;
1292                                 ip_rt_gc_elasticity     = saved_elasticity;
1293                                 goto restart;
1294                         }
1295
1296                         net_warn_ratelimited("Neighbour table overflow\n");
1297                         rt_drop(rt);
1298                         return ERR_PTR(-ENOBUFS);
1299                 }
1300         }
1301
1302         rt->dst.rt_next = rt_hash_table[hash].chain;
1303
1304         /*
1305          * Since lookup is lockfree, we must make sure
1306          * previous writes to rt are committed to memory
1307          * before making rt visible to other CPUS.
1308          */
1309         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1310
1311         spin_unlock_bh(rt_hash_lock_addr(hash));
1312
1313 skip_hashing:
1314         if (skb)
1315                 skb_dst_set(skb, &rt->dst);
1316         return rt;
1317 }
1318
1319 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1320
1321 static u32 rt_peer_genid(void)
1322 {
1323         return atomic_read(&__rt_peer_genid);
1324 }
1325
1326 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1327 {
1328         struct inet_peer_base *base;
1329         struct inet_peer *peer;
1330
1331         base = inetpeer_base_ptr(rt->_peer);
1332         if (!base)
1333                 return;
1334
1335         peer = inet_getpeer_v4(base, daddr, create);
1336
1337         if (!rt_set_peer(rt, peer))
1338                 inet_putpeer(peer);
1339         else
1340                 rt->rt_peer_genid = rt_peer_genid();
1341 }
1342
1343 /*
1344  * Peer allocation may fail only in serious out-of-memory conditions.  However
1345  * we can still generate some output.
1346  * Random ID selection looks a bit dangerous because we have no chance of
1347  * selecting an ID that is unique within a reasonable period of time.
1348  * But a broken packet identifier may be better than no packet at all.
1349  */
1350 static void ip_select_fb_ident(struct iphdr *iph)
1351 {
1352         static DEFINE_SPINLOCK(ip_fb_id_lock);
1353         static u32 ip_fallback_id;
1354         u32 salt;
1355
1356         spin_lock_bh(&ip_fb_id_lock);
1357         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1358         iph->id = htons(salt & 0xFFFF);
1359         ip_fallback_id = salt;
1360         spin_unlock_bh(&ip_fb_id_lock);
1361 }
1362
1363 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1364 {
1365         struct rtable *rt = (struct rtable *) dst;
1366
1367         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1368                 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1369
1370                 /* If peer is attached to destination, it is never detached,
1371                    so we need not grab a lock to dereference it.
1372                  */
1373                 if (peer) {
1374                         iph->id = htons(inet_getid(peer, more));
1375                         return;
1376                 }
1377         } else if (!rt)
1378                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1379
1380         ip_select_fb_ident(iph);
1381 }
1382 EXPORT_SYMBOL(__ip_select_ident);
1383
1384 static void rt_del(unsigned int hash, struct rtable *rt)
1385 {
1386         struct rtable __rcu **rthp;
1387         struct rtable *aux;
1388
1389         rthp = &rt_hash_table[hash].chain;
1390         spin_lock_bh(rt_hash_lock_addr(hash));
1391         ip_rt_put(rt);
1392         while ((aux = rcu_dereference_protected(*rthp,
1393                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1394                 if (aux == rt || rt_is_expired(aux)) {
1395                         *rthp = aux->dst.rt_next;
1396                         rt_free(aux);
1397                         continue;
1398                 }
1399                 rthp = &aux->dst.rt_next;
1400         }
1401         spin_unlock_bh(rt_hash_lock_addr(hash));
1402 }
1403
1404 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1405 {
1406         struct rtable *rt = (struct rtable *) dst;
1407         __be32 orig_gw = rt->rt_gateway;
1408         struct neighbour *n, *old_n;
1409
1410         dst_confirm(&rt->dst);
1411
1412         rt->rt_gateway = peer->redirect_learned.a4;
1413
1414         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1415         if (IS_ERR(n)) {
1416                 rt->rt_gateway = orig_gw;
1417                 return;
1418         }
1419         old_n = xchg(&rt->dst._neighbour, n);
1420         if (old_n)
1421                 neigh_release(old_n);
1422         if (!(n->nud_state & NUD_VALID)) {
1423                 neigh_event_send(n, NULL);
1424         } else {
1425                 rt->rt_flags |= RTCF_REDIRECTED;
1426                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1427         }
1428 }
1429
1430 /* called in rcu_read_lock() section */
1431 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1432                     __be32 saddr, struct net_device *dev)
1433 {
1434         int s, i;
1435         struct in_device *in_dev = __in_dev_get_rcu(dev);
1436         __be32 skeys[2] = { saddr, 0 };
1437         int    ikeys[2] = { dev->ifindex, 0 };
1438         struct inet_peer *peer;
1439         struct net *net;
1440
1441         if (!in_dev)
1442                 return;
1443
1444         net = dev_net(dev);
1445         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1446             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1447             ipv4_is_zeronet(new_gw))
1448                 goto reject_redirect;
1449
1450         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1451                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1452                         goto reject_redirect;
1453                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1454                         goto reject_redirect;
1455         } else {
1456                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1457                         goto reject_redirect;
1458         }
1459
1460         for (s = 0; s < 2; s++) {
1461                 for (i = 0; i < 2; i++) {
1462                         unsigned int hash;
1463                         struct rtable __rcu **rthp;
1464                         struct rtable *rt;
1465
1466                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1467
1468                         rthp = &rt_hash_table[hash].chain;
1469
1470                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1471                                 rthp = &rt->dst.rt_next;
1472
1473                                 if (rt->rt_key_dst != daddr ||
1474                                     rt->rt_key_src != skeys[s] ||
1475                                     rt->rt_oif != ikeys[i] ||
1476                                     rt_is_input_route(rt) ||
1477                                     rt_is_expired(rt) ||
1478                                     !net_eq(dev_net(rt->dst.dev), net) ||
1479                                     rt->dst.error ||
1480                                     rt->dst.dev != dev ||
1481                                     rt->rt_gateway != old_gw)
1482                                         continue;
1483
1484                                 peer = rt_get_peer_create(rt, rt->rt_dst);
1485                                 if (peer) {
1486                                         if (peer->redirect_learned.a4 != new_gw) {
1487                                                 peer->redirect_learned.a4 = new_gw;
1488                                                 atomic_inc(&__rt_peer_genid);
1489                                         }
1490                                         check_peer_redir(&rt->dst, peer);
1491                                 }
1492                         }
1493                 }
1494         }
1495         return;
1496
1497 reject_redirect:
1498 #ifdef CONFIG_IP_ROUTE_VERBOSE
1499         if (IN_DEV_LOG_MARTIANS(in_dev))
1500                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1501                                      "  Advised path = %pI4 -> %pI4\n",
1502                                      &old_gw, dev->name, &new_gw,
1503                                      &saddr, &daddr);
1504 #endif
1505         ;
1506 }
1507
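/* The two helpers below use cmpxchg() on peer->pmtu_expires so that only
 * one caller observes a non-zero expiry and wins the right to restore the
 * original MTU; concurrent callers read back 0 and do nothing.
 */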
1508 static bool peer_pmtu_expired(struct inet_peer *peer)
1509 {
1510         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1511
1512         return orig &&
1513                time_after_eq(jiffies, orig) &&
1514                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1515 }
1516
1517 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1518 {
1519         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1520
1521         return orig &&
1522                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1523 }
1524
1525 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1526 {
1527         struct rtable *rt = (struct rtable *)dst;
1528         struct dst_entry *ret = dst;
1529
1530         if (rt) {
1531                 if (dst->obsolete > 0) {
1532                         ip_rt_put(rt);
1533                         ret = NULL;
1534                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1535                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1536                                                 rt->rt_oif,
1537                                                 rt_genid(dev_net(dst->dev)));
1538                         rt_del(hash, rt);
1539                         ret = NULL;
1540                 } else if (rt_has_peer(rt)) {
1541                         struct inet_peer *peer = rt_peer_ptr(rt);
1542                         if (peer_pmtu_expired(peer))
1543                                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1544                 }
1545         }
1546         return ret;
1547 }
1548
1549 /*
1550  * Algorithm:
1551  *      1. The first ip_rt_redirect_number redirects are sent
1552  *         with exponential backoff, then we stop sending them altogether,
1553  *         assuming that the host ignores our redirects.
1554  *      2. If we did not see packets requiring redirects
1555  *         during ip_rt_redirect_silence, we assume that the host
1556  *         has forgotten the redirected route and start sending redirects again.
1557  *
1558  * This algorithm is much cheaper and more intelligent than dumb load limiting
1559  * in icmp.c.
1560  *
1561  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1562  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1563  */
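/* Concretely: after the n-th redirect has been sent, the next one is only
 * emitted once jiffies passes rate_last + (ip_rt_redirect_load << n), so
 * with the usual defaults (load = HZ/50, number = 9) the gaps roughly
 * double from 40ms upwards, and after nine unanswered redirects we stay
 * silent until a quiet period of ip_rt_redirect_silence has elapsed.
 */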
1564
1565 void ip_rt_send_redirect(struct sk_buff *skb)
1566 {
1567         struct rtable *rt = skb_rtable(skb);
1568         struct in_device *in_dev;
1569         struct inet_peer *peer;
1570         int log_martians;
1571
1572         rcu_read_lock();
1573         in_dev = __in_dev_get_rcu(rt->dst.dev);
1574         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1575                 rcu_read_unlock();
1576                 return;
1577         }
1578         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1579         rcu_read_unlock();
1580
1581         peer = rt_get_peer_create(rt, rt->rt_dst);
1582         if (!peer) {
1583                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1584                 return;
1585         }
1586
1587         /* No redirected packets during ip_rt_redirect_silence;
1588          * reset the algorithm.
1589          */
1590         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1591                 peer->rate_tokens = 0;
1592
1593         /* Too many ignored redirects; do not send anything,
1594          * just set peer->rate_last to the last seen redirected packet.
1595          */
1596         if (peer->rate_tokens >= ip_rt_redirect_number) {
1597                 peer->rate_last = jiffies;
1598                 return;
1599         }
1600
1601         /* Check for load limit; set rate_last to the latest sent
1602          * redirect.
1603          */
1604         if (peer->rate_tokens == 0 ||
1605             time_after(jiffies,
1606                        (peer->rate_last +
1607                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1608                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1609                 peer->rate_last = jiffies;
1610                 ++peer->rate_tokens;
1611 #ifdef CONFIG_IP_ROUTE_VERBOSE
1612                 if (log_martians &&
1613                     peer->rate_tokens == ip_rt_redirect_number)
1614                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1615                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1616                                              &rt->rt_dst, &rt->rt_gateway);
1617 #endif
1618         }
1619 }
1620
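/* ip_error() rate-limits the ICMP destination-unreachable replies below
 * with a token bucket kept in the inet_peer: tokens accrue one per jiffy
 * up to ip_rt_error_burst, and each transmitted error costs
 * ip_rt_error_cost.
 */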
1621 static int ip_error(struct sk_buff *skb)
1622 {
1623         struct rtable *rt = skb_rtable(skb);
1624         struct inet_peer *peer;
1625         unsigned long now;
1626         bool send;
1627         int code;
1628
1629         switch (rt->dst.error) {
1630         case EINVAL:
1631         default:
1632                 goto out;
1633         case EHOSTUNREACH:
1634                 code = ICMP_HOST_UNREACH;
1635                 break;
1636         case ENETUNREACH:
1637                 code = ICMP_NET_UNREACH;
1638                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1639                                 IPSTATS_MIB_INNOROUTES);
1640                 break;
1641         case EACCES:
1642                 code = ICMP_PKT_FILTERED;
1643                 break;
1644         }
1645
1646         peer = rt_get_peer_create(rt, rt->rt_dst);
1647
1648         send = true;
1649         if (peer) {
1650                 now = jiffies;
1651                 peer->rate_tokens += now - peer->rate_last;
1652                 if (peer->rate_tokens > ip_rt_error_burst)
1653                         peer->rate_tokens = ip_rt_error_burst;
1654                 peer->rate_last = now;
1655                 if (peer->rate_tokens >= ip_rt_error_cost)
1656                         peer->rate_tokens -= ip_rt_error_cost;
1657                 else
1658                         send = false;
1659         }
1660         if (send)
1661                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1662
1663 out:    kfree_skb(skb);
1664         return 0;
1665 }
1666
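/* Fold the peer's learned PMTU into the dst while the learned value is
 * still valid (pmtu_expires lies in the future), remembering the original
 * metric in pmtu_orig; once the timer has run out, the cmpxchg() lets a
 * single caller restore the original MTU.
 */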
1667 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1668 {
1669         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1670
1671         if (!expires)
1672                 return;
1673         if (time_before(jiffies, expires)) {
1674                 u32 orig_dst_mtu = dst_mtu(dst);
1675                 if (peer->pmtu_learned < orig_dst_mtu) {
1676                         if (!peer->pmtu_orig)
1677                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1678                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1679                 }
1680         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1681                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1682 }
1683
1684 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1685 {
1686         struct rtable *rt = (struct rtable *) dst;
1687         struct inet_peer *peer;
1688
1689         dst_confirm(dst);
1690
1691         peer = rt_get_peer_create(rt, rt->rt_dst);
1692         if (peer) {
1693                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1694
1695                 if (mtu < ip_rt_min_pmtu)
1696                         mtu = ip_rt_min_pmtu;
1697                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1698
1699                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1700                         if (!pmtu_expires)
1701                                 pmtu_expires = 1UL;
1702
1703                         peer->pmtu_learned = mtu;
1704                         peer->pmtu_expires = pmtu_expires;
1705
1706                         atomic_inc(&__rt_peer_genid);
1707                         rt->rt_peer_genid = rt_peer_genid();
1708                 }
1709                 check_peer_pmtu(dst, peer);
1710         }
1711 }
1712
1713
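/* Cached routes carry rt_peer_genid.  Whenever the global peer generation
 * counter moves (a new PMTU or redirect was learned somewhere), the next
 * lookup that hits this route re-synchronizes it with its inet_peer here.
 */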
1714 static void ipv4_validate_peer(struct rtable *rt)
1715 {
1716         if (rt->rt_peer_genid != rt_peer_genid()) {
1717                 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
1718
1719                 if (peer) {
1720                         check_peer_pmtu(&rt->dst, peer);
1721
1722                         if (peer->redirect_learned.a4 &&
1723                             peer->redirect_learned.a4 != rt->rt_gateway)
1724                                 check_peer_redir(&rt->dst, peer);
1725                 }
1726
1727                 rt->rt_peer_genid = rt_peer_genid();
1728         }
1729 }
1730
1731 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1732 {
1733         struct rtable *rt = (struct rtable *) dst;
1734
1735         if (rt_is_expired(rt))
1736                 return NULL;
1737         ipv4_validate_peer(rt);
1738         return dst;
1739 }
1740
1741 static void ipv4_dst_destroy(struct dst_entry *dst)
1742 {
1743         struct rtable *rt = (struct rtable *) dst;
1744
1745         if (rt->fi) {
1746                 fib_info_put(rt->fi);
1747                 rt->fi = NULL;
1748         }
1749         if (rt_has_peer(rt)) {
1750                 struct inet_peer *peer = rt_peer_ptr(rt);
1751                 inet_putpeer(peer);
1752         }
1753 }
1754
1755
1756 static void ipv4_link_failure(struct sk_buff *skb)
1757 {
1758         struct rtable *rt;
1759
1760         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1761
1762         rt = skb_rtable(skb);
1763         if (rt && rt_has_peer(rt)) {
1764                 struct inet_peer *peer = rt_peer_ptr(rt);
1765                 if (peer_pmtu_cleaned(peer))
1766                         dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
1767         }
1768 }
1769
1770 static int ip_rt_bug(struct sk_buff *skb)
1771 {
1772         pr_debug("%s: %pI4 -> %pI4, %s\n",
1773                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1774                  skb->dev ? skb->dev->name : "?");
1775         kfree_skb(skb);
1776         WARN_ON(1);
1777         return 0;
1778 }
1779
1780 /*
1781    We do not cache the source address of the outgoing interface,
1782    because it is used only by the IP RR, TS and SRR options,
1783    so it is out of the fast path.
1784
1785    BTW remember: "addr" may be unaligned
1786    in IP options!
1787  */
1788
1789 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1790 {
1791         __be32 src;
1792
1793         if (rt_is_output_route(rt))
1794                 src = ip_hdr(skb)->saddr;
1795         else {
1796                 struct fib_result res;
1797                 struct flowi4 fl4;
1798                 struct iphdr *iph;
1799
1800                 iph = ip_hdr(skb);
1801
1802                 memset(&fl4, 0, sizeof(fl4));
1803                 fl4.daddr = iph->daddr;
1804                 fl4.saddr = iph->saddr;
1805                 fl4.flowi4_tos = RT_TOS(iph->tos);
1806                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1807                 fl4.flowi4_iif = skb->dev->ifindex;
1808                 fl4.flowi4_mark = skb->mark;
1809
1810                 rcu_read_lock();
1811                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1812                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1813                 else
1814                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1815                                         RT_SCOPE_UNIVERSE);
1816                 rcu_read_unlock();
1817         }
1818         memcpy(addr, &src, 4);
1819 }
1820
1821 #ifdef CONFIG_IP_ROUTE_CLASSID
1822 static void set_class_tag(struct rtable *rt, u32 tag)
1823 {
1824         if (!(rt->dst.tclassid & 0xFFFF))
1825                 rt->dst.tclassid |= tag & 0xFFFF;
1826         if (!(rt->dst.tclassid & 0xFFFF0000))
1827                 rt->dst.tclassid |= tag & 0xFFFF0000;
1828 }
1829 #endif
1830
1831 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1832 {
1833         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1834
1835         if (advmss == 0) {
1836                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1837                                ip_rt_min_advmss);
1838                 if (advmss > 65535 - 40)
1839                         advmss = 65535 - 40;
1840         }
1841         return advmss;
1842 }
1843
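/* Report the effective MTU of a route: an explicit RTAX_MTU metric wins
 * for output routes, otherwise we fall back to the device MTU.  When the
 * metric is locked and the route is gatewayed, clamp to the classical
 * 576-byte value, and never exceed IP_MAX_MTU.
 */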
1844 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1845 {
1846         const struct rtable *rt = (const struct rtable *) dst;
1847         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1848
1849         if (mtu && rt_is_output_route(rt))
1850                 return mtu;
1851
1852         mtu = dst->dev->mtu;
1853
1854         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1855
1856                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1857                         mtu = 576;
1858         }
1859
1860         if (mtu > IP_MAX_MTU)
1861                 mtu = IP_MAX_MTU;
1862
1863         return mtu;
1864 }
1865
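/* Attach metrics to a freshly built route.  If an inet_peer exists (or is
 * created for FLOWI_FLAG_PRECOW_METRICS flows), the route shares the
 * peer's per-destination metrics, seeding them from the fib_info on first
 * use and picking up any already-learned PMTU or redirected gateway;
 * otherwise the route references the fib_info's read-only metrics.
 */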
1866 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1867                             struct fib_info *fi)
1868 {
1869         struct inet_peer_base *base;
1870         struct inet_peer *peer;
1871         int create = 0;
1872
1873         /* If a peer entry exists for this destination, we must hook
1874          * it up in order to get at cached metrics.
1875          */
1876         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1877                 create = 1;
1878
1879         base = inetpeer_base_ptr(rt->_peer);
1880         BUG_ON(!base);
1881
1882         peer = inet_getpeer_v4(base, rt->rt_dst, create);
1883         if (peer) {
1884                 __rt_set_peer(rt, peer);
1885                 rt->rt_peer_genid = rt_peer_genid();
1886                 if (inet_metrics_new(peer))
1887                         memcpy(peer->metrics, fi->fib_metrics,
1888                                sizeof(u32) * RTAX_MAX);
1889                 dst_init_metrics(&rt->dst, peer->metrics, false);
1890
1891                 check_peer_pmtu(&rt->dst, peer);
1892
1893                 if (peer->redirect_learned.a4 &&
1894                     peer->redirect_learned.a4 != rt->rt_gateway) {
1895                         rt->rt_gateway = peer->redirect_learned.a4;
1896                         rt->rt_flags |= RTCF_REDIRECTED;
1897                 }
1898         } else {
1899                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1900                         rt->fi = fi;
1901                         atomic_inc(&fi->fib_clntref);
1902                 }
1903                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1904         }
1905 }
1906
1907 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1908                            const struct fib_result *res,
1909                            struct fib_info *fi, u16 type, u32 itag)
1910 {
1911         struct dst_entry *dst = &rt->dst;
1912
1913         if (fi) {
1914                 if (FIB_RES_GW(*res) &&
1915                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1916                         rt->rt_gateway = FIB_RES_GW(*res);
1917                 rt_init_metrics(rt, fl4, fi);
1918 #ifdef CONFIG_IP_ROUTE_CLASSID
1919                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1920 #endif
1921         }
1922
1923         if (dst_mtu(dst) > IP_MAX_MTU)
1924                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1925         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1926                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1927
1928 #ifdef CONFIG_IP_ROUTE_CLASSID
1929 #ifdef CONFIG_IP_MULTIPLE_TABLES
1930         set_class_tag(rt, fib_rules_tclass(res));
1931 #endif
1932         set_class_tag(rt, itag);
1933 #endif
1934 }
1935
1936 static struct rtable *rt_dst_alloc(struct net_device *dev,
1937                                    bool nopolicy, bool noxfrm)
1938 {
1939         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1940                          DST_HOST |
1941                          (nopolicy ? DST_NOPOLICY : 0) |
1942                          (noxfrm ? DST_NOXFRM : 0));
1943 }
1944
1945 /* called in rcu_read_lock() section */
1946 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1947                                 u8 tos, struct net_device *dev, int our)
1948 {
1949         unsigned int hash;
1950         struct rtable *rth;
1951         __be32 spec_dst;
1952         struct in_device *in_dev = __in_dev_get_rcu(dev);
1953         u32 itag = 0;
1954         int err;
1955
1956         /* Primary sanity checks. */
1957
1958         if (in_dev == NULL)
1959                 return -EINVAL;
1960
1961         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1962             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1963                 goto e_inval;
1964
1965         if (ipv4_is_zeronet(saddr)) {
1966                 if (!ipv4_is_local_multicast(daddr))
1967                         goto e_inval;
1968                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1969         } else {
1970                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1971                                           &itag);
1972                 if (err < 0)
1973                         goto e_err;
1974         }
1975         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1976                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1977         if (!rth)
1978                 goto e_nobufs;
1979
1980 #ifdef CONFIG_IP_ROUTE_CLASSID
1981         rth->dst.tclassid = itag;
1982 #endif
1983         rth->dst.output = ip_rt_bug;
1984
1985         rth->rt_key_dst = daddr;
1986         rth->rt_key_src = saddr;
1987         rth->rt_genid   = rt_genid(dev_net(dev));
1988         rth->rt_flags   = RTCF_MULTICAST;
1989         rth->rt_type    = RTN_MULTICAST;
1990         rth->rt_key_tos = tos;
1991         rth->rt_dst     = daddr;
1992         rth->rt_src     = saddr;
1993         rth->rt_route_iif = dev->ifindex;
1994         rth->rt_iif     = dev->ifindex;
1995         rth->rt_oif     = 0;
1996         rth->rt_mark    = skb->mark;
1997         rth->rt_gateway = daddr;
1998         rth->rt_spec_dst = spec_dst;
1999         rth->rt_peer_genid = 0;
2000         rt_init_peer(rth, dev_net(dev)->ipv4.peers);
2001         rth->fi = NULL;
2002         if (our) {
2003                 rth->dst.input = ip_local_deliver;
2004                 rth->rt_flags |= RTCF_LOCAL;
2005         }
2006
2007 #ifdef CONFIG_IP_MROUTE
2008         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2009                 rth->dst.input = ip_mr_input;
2010 #endif
2011         RT_CACHE_STAT_INC(in_slow_mc);
2012
2013         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2014         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2015         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2016
2017 e_nobufs:
2018         return -ENOBUFS;
2019 e_inval:
2020         return -EINVAL;
2021 e_err:
2022         return err;
2023 }
2024
2025
2026 static void ip_handle_martian_source(struct net_device *dev,
2027                                      struct in_device *in_dev,
2028                                      struct sk_buff *skb,
2029                                      __be32 daddr,
2030                                      __be32 saddr)
2031 {
2032         RT_CACHE_STAT_INC(in_martian_src);
2033 #ifdef CONFIG_IP_ROUTE_VERBOSE
2034         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2035                 /*
2036                  *      RFC 1812 recommendation: if the source is martian,
2037                  *      the only hint is the MAC header.
2038                  */
2039                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2040                         &daddr, &saddr, dev->name);
2041                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2042                         print_hex_dump(KERN_WARNING, "ll header: ",
2043                                        DUMP_PREFIX_OFFSET, 16, 1,
2044                                        skb_mac_header(skb),
2045                                        dev->hard_header_len, true);
2046                 }
2047         }
2048 #endif
2049 }
2050
2051 /* called in rcu_read_lock() section */
2052 static int __mkroute_input(struct sk_buff *skb,
2053                            const struct fib_result *res,
2054                            struct in_device *in_dev,
2055                            __be32 daddr, __be32 saddr, u32 tos,
2056                            struct rtable **result)
2057 {
2058         struct rtable *rth;
2059         int err;
2060         struct in_device *out_dev;
2061         unsigned int flags = 0;
2062         __be32 spec_dst;
2063         u32 itag;
2064
2065         /* get a working reference to the output device */
2066         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2067         if (out_dev == NULL) {
2068                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2069                 return -EINVAL;
2070         }
2071
2072
2073         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2074                                   in_dev->dev, &spec_dst, &itag);
2075         if (err < 0) {
2076                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2077                                          saddr);
2078
2079                 goto cleanup;
2080         }
2081
2082         if (err)
2083                 flags |= RTCF_DIRECTSRC;
2084
2085         if (out_dev == in_dev && err &&
2086             (IN_DEV_SHARED_MEDIA(out_dev) ||
2087              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2088                 flags |= RTCF_DOREDIRECT;
2089
2090         if (skb->protocol != htons(ETH_P_IP)) {
2091                 /* Not IP (i.e. ARP). Do not create a route if it is
2092                  * invalid for proxy arp. DNAT routes are always valid.
2093                  *
2094                  * The proxy arp feature has been extended to allow ARP
2095                  * replies back out the same interface, to support
2096                  * Private VLAN switch technologies. See arp.c.
2097                  */
2098                 if (out_dev == in_dev &&
2099                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2100                         err = -EINVAL;
2101                         goto cleanup;
2102                 }
2103         }
2104
2105         rth = rt_dst_alloc(out_dev->dev,
2106                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2107                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2108         if (!rth) {
2109                 err = -ENOBUFS;
2110                 goto cleanup;
2111         }
2112
2113         rth->rt_key_dst = daddr;
2114         rth->rt_key_src = saddr;
2115         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2116         rth->rt_flags = flags;
2117         rth->rt_type = res->type;
2118         rth->rt_key_tos = tos;
2119         rth->rt_dst     = daddr;
2120         rth->rt_src     = saddr;
2121         rth->rt_route_iif = in_dev->dev->ifindex;
2122         rth->rt_iif     = in_dev->dev->ifindex;
2123         rth->rt_oif     = 0;
2124         rth->rt_mark    = skb->mark;
2125         rth->rt_gateway = daddr;
2126         rth->rt_spec_dst = spec_dst;
2127         rth->rt_peer_genid = 0;
2128         rt_init_peer(rth, &res->table->tb_peers);
2129         rth->fi = NULL;
2130
2131         rth->dst.input = ip_forward;
2132         rth->dst.output = ip_output;
2133
2134         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2135
2136         *result = rth;
2137         err = 0;
2138  cleanup:
2139         return err;
2140 }
2141
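/* Select a multipath nexthop if configured, build the cache entry with
 * __mkroute_input() and insert it into the route hash.
 */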
2142 static int ip_mkroute_input(struct sk_buff *skb,
2143                             struct fib_result *res,
2144                             const struct flowi4 *fl4,
2145                             struct in_device *in_dev,
2146                             __be32 daddr, __be32 saddr, u32 tos)
2147 {
2148         struct rtable *rth = NULL;
2149         int err;
2150         unsigned int hash;
2151
2152 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2153         if (res->fi && res->fi->fib_nhs > 1)
2154                 fib_select_multipath(res);
2155 #endif
2156
2157         /* create a routing cache entry */
2158         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2159         if (err)
2160                 return err;
2161
2162         /* put it into the cache */
2163         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2164                        rt_genid(dev_net(rth->dst.dev)));
2165         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2166         if (IS_ERR(rth))
2167                 return PTR_ERR(rth);
2168         return 0;
2169 }
2170
2171 /*
2172  *      NOTE. We drop all packets that have local source
2173  *      addresses, because every properly looped-back packet
2174  *      must already have the correct destination attached by the output routine.
2175  *
2176  *      This approach solves two big problems:
2177  *      1. Non-simplex devices are handled properly.
2178  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2179  *      called with rcu_read_lock()
2180  */
2181
2182 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2183                                u8 tos, struct net_device *dev)
2184 {
2185         struct fib_result res;
2186         struct in_device *in_dev = __in_dev_get_rcu(dev);
2187         struct flowi4   fl4;
2188         unsigned int    flags = 0;
2189         u32             itag = 0;
2190         struct rtable   *rth;
2191         unsigned int    hash;
2192         __be32          spec_dst;
2193         int             err = -EINVAL;
2194         struct net    *net = dev_net(dev);
2195
2196         /* IP on this device is disabled. */
2197
2198         if (!in_dev)
2199                 goto out;
2200
2201         /* Check for the most weird martians, which cannot be detected
2202            by fib_lookup.
2203          */
2204
2205         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2206             ipv4_is_loopback(saddr))
2207                 goto martian_source;
2208
2209         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2210                 goto brd_input;
2211
2212         /* Accept zero addresses only for limited broadcast;
2213          * I do not even know whether to fix this or not. Waiting for complaints :-)
2214          */
2215         if (ipv4_is_zeronet(saddr))
2216                 goto martian_source;
2217
2218         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2219                 goto martian_destination;
2220
2221         /*
2222          *      Now we are ready to route packet.
2223          */
2224         fl4.flowi4_oif = 0;
2225         fl4.flowi4_iif = dev->ifindex;
2226         fl4.flowi4_mark = skb->mark;
2227         fl4.flowi4_tos = tos;
2228         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2229         fl4.daddr = daddr;
2230         fl4.saddr = saddr;
2231         err = fib_lookup(net, &fl4, &res);
2232         if (err != 0) {
2233                 if (!IN_DEV_FORWARD(in_dev))
2234                         goto e_hostunreach;
2235                 goto no_route;
2236         }
2237
2238         RT_CACHE_STAT_INC(in_slow_tot);
2239
2240         if (res.type == RTN_BROADCAST)
2241                 goto brd_input;
2242
2243         if (res.type == RTN_LOCAL) {
2244                 err = fib_validate_source(skb, saddr, daddr, tos,
2245                                           net->loopback_dev->ifindex,
2246                                           dev, &spec_dst, &itag);
2247                 if (err < 0)
2248                         goto martian_source_keep_err;
2249                 if (err)
2250                         flags |= RTCF_DIRECTSRC;
2251                 spec_dst = daddr;
2252                 goto local_input;
2253         }
2254
2255         if (!IN_DEV_FORWARD(in_dev))
2256                 goto e_hostunreach;
2257         if (res.type != RTN_UNICAST)
2258                 goto martian_destination;
2259
2260         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2261 out:    return err;
2262
2263 brd_input:
2264         if (skb->protocol != htons(ETH_P_IP))
2265                 goto e_inval;
2266
2267         if (ipv4_is_zeronet(saddr))
2268                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2269         else {
2270                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2271                                           &itag);
2272                 if (err < 0)
2273                         goto martian_source_keep_err;
2274                 if (err)
2275                         flags |= RTCF_DIRECTSRC;
2276         }
2277         flags |= RTCF_BROADCAST;
2278         res.type = RTN_BROADCAST;
2279         RT_CACHE_STAT_INC(in_brd);
2280
2281 local_input:
2282         rth = rt_dst_alloc(net->loopback_dev,
2283                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2284         if (!rth)
2285                 goto e_nobufs;
2286
2287         rth->dst.input = ip_local_deliver;
2288         rth->dst.output = ip_rt_bug;
2289 #ifdef CONFIG_IP_ROUTE_CLASSID
2290         rth->dst.tclassid = itag;
2291 #endif
2292
2293         rth->rt_key_dst = daddr;
2294         rth->rt_key_src = saddr;
2295         rth->rt_genid = rt_genid(net);
2296         rth->rt_flags   = flags|RTCF_LOCAL;
2297         rth->rt_type    = res.type;
2298         rth->rt_key_tos = tos;
2299         rth->rt_dst     = daddr;
2300         rth->rt_src     = saddr;
2301 #ifdef CONFIG_IP_ROUTE_CLASSID
2302         rth->dst.tclassid = itag;
2303 #endif
2304         rth->rt_route_iif = dev->ifindex;
2305         rth->rt_iif     = dev->ifindex;
2306         rth->rt_oif     = 0;
2307         rth->rt_mark    = skb->mark;
2308         rth->rt_gateway = daddr;
2309         rth->rt_spec_dst = spec_dst;
2310         rth->rt_peer_genid = 0;
2311         rt_init_peer(rth, net->ipv4.peers);
2312         rth->fi = NULL;
2313         if (res.type == RTN_UNREACHABLE) {
2314                 rth->dst.input = ip_error;
2315                 rth->dst.error = -err;
2316                 rth->rt_flags   &= ~RTCF_LOCAL;
2317         }
2318         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2319         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2320         err = 0;
2321         if (IS_ERR(rth))
2322                 err = PTR_ERR(rth);
2323         goto out;
2324
2325 no_route:
2326         RT_CACHE_STAT_INC(in_no_route);
2327         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2328         res.type = RTN_UNREACHABLE;
2329         if (err == -ESRCH)
2330                 err = -ENETUNREACH;
2331         goto local_input;
2332
2333         /*
2334          *      Do not cache martian addresses: they should be logged (RFC1812)
2335          */
2336 martian_destination:
2337         RT_CACHE_STAT_INC(in_martian_dst);
2338 #ifdef CONFIG_IP_ROUTE_VERBOSE
2339         if (IN_DEV_LOG_MARTIANS(in_dev))
2340                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2341                                      &daddr, &saddr, dev->name);
2342 #endif
2343
2344 e_hostunreach:
2345         err = -EHOSTUNREACH;
2346         goto out;
2347
2348 e_inval:
2349         err = -EINVAL;
2350         goto out;
2351
2352 e_nobufs:
2353         err = -ENOBUFS;
2354         goto out;
2355
2356 martian_source:
2357         err = -EINVAL;
2358 martian_source_keep_err:
2359         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2360         goto out;
2361 }
2362
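/* Input path entry point: the route cache is consulted first, keyed on
 * (daddr, saddr, iif, tos) plus the skb mark; on a hit the dst is
 * attached with or without a reference depending on @noref.  On a miss,
 * multicast destinations are screened here and everything else falls
 * through to ip_route_input_slow().
 */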
2363 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2364                            u8 tos, struct net_device *dev, bool noref)
2365 {
2366         struct rtable   *rth;
2367         unsigned int    hash;
2368         int iif = dev->ifindex;
2369         struct net *net;
2370         int res;
2371
2372         net = dev_net(dev);
2373
2374         rcu_read_lock();
2375
2376         if (!rt_caching(net))
2377                 goto skip_cache;
2378
2379         tos &= IPTOS_RT_MASK;
2380         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2381
2382         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2383              rth = rcu_dereference(rth->dst.rt_next)) {
2384                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2385                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2386                      (rth->rt_route_iif ^ iif) |
2387                      (rth->rt_key_tos ^ tos)) == 0 &&
2388                     rth->rt_mark == skb->mark &&
2389                     net_eq(dev_net(rth->dst.dev), net) &&
2390                     !rt_is_expired(rth)) {
2391                         ipv4_validate_peer(rth);
2392                         if (noref) {
2393                                 dst_use_noref(&rth->dst, jiffies);
2394                                 skb_dst_set_noref(skb, &rth->dst);
2395                         } else {
2396                                 dst_use(&rth->dst, jiffies);
2397                                 skb_dst_set(skb, &rth->dst);
2398                         }
2399                         RT_CACHE_STAT_INC(in_hit);
2400                         rcu_read_unlock();
2401                         return 0;
2402                 }
2403                 RT_CACHE_STAT_INC(in_hlist_search);
2404         }
2405
2406 skip_cache:
2407         /* Multicast recognition logic has moved from the route cache to here.
2408            The problem was that too many Ethernet cards have broken/missing
2409            hardware multicast filters :-( As a result, a host on a multicast
2410            network acquires a lot of useless route cache entries, e.g. for
2411            SDR messages from all over the world. Now we try to get rid of them.
2412            Really, provided the software IP multicast filter is organized
2413            reasonably (at least, hashed), it does not result in a slowdown
2414            compared with route cache reject entries.
2415            Note that multicast routers are not affected, because a
2416            route cache entry is created eventually.
2417          */
2418         if (ipv4_is_multicast(daddr)) {
2419                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2420
2421                 if (in_dev) {
2422                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2423                                                   ip_hdr(skb)->protocol);
2424                         if (our
2425 #ifdef CONFIG_IP_MROUTE
2426                                 ||
2427                             (!ipv4_is_local_multicast(daddr) &&
2428                              IN_DEV_MFORWARD(in_dev))
2429 #endif
2430                            ) {
2431                                 int res = ip_route_input_mc(skb, daddr, saddr,
2432                                                             tos, dev, our);
2433                                 rcu_read_unlock();
2434                                 return res;
2435                         }
2436                 }
2437                 rcu_read_unlock();
2438                 return -EINVAL;
2439         }
2440         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2441         rcu_read_unlock();
2442         return res;
2443 }
2444 EXPORT_SYMBOL(ip_route_input_common);
2445
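/* Build an output route for a resolved fib_result: classify the
 * destination (broadcast, multicast or invalid zeronet), refuse loopback
 * source addresses on non-loopback devices, allocate the rtable and wire
 * up the output (and, for local/multicast, input) handlers before
 * rt_set_nexthop() fills in the gateway and metrics.
 */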
2446 /* called with rcu_read_lock() */
2447 static struct rtable *__mkroute_output(const struct fib_result *res,
2448                                        const struct flowi4 *fl4,
2449                                        __be32 orig_daddr, __be32 orig_saddr,
2450                                        int orig_oif, __u8 orig_rtos,
2451                                        struct net_device *dev_out,
2452                                        unsigned int flags)
2453 {
2454         struct fib_info *fi = res->fi;
2455         struct in_device *in_dev;
2456         u16 type = res->type;
2457         struct rtable *rth;
2458
2459         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2460                 return ERR_PTR(-EINVAL);
2461
2462         if (ipv4_is_lbcast(fl4->daddr))
2463                 type = RTN_BROADCAST;
2464         else if (ipv4_is_multicast(fl4->daddr))
2465                 type = RTN_MULTICAST;
2466         else if (ipv4_is_zeronet(fl4->daddr))
2467                 return ERR_PTR(-EINVAL);
2468
2469         if (dev_out->flags & IFF_LOOPBACK)
2470                 flags |= RTCF_LOCAL;
2471
2472         in_dev = __in_dev_get_rcu(dev_out);
2473         if (!in_dev)
2474                 return ERR_PTR(-EINVAL);
2475
2476         if (type == RTN_BROADCAST) {
2477                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2478                 fi = NULL;
2479         } else if (type == RTN_MULTICAST) {
2480                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2481                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2482                                      fl4->flowi4_proto))
2483                         flags &= ~RTCF_LOCAL;
2484                 /* If a multicast route does not exist, use the
2485                  * default one, but do not gateway in this case.
2486                  * Yes, it is a hack.
2487                  */
2488                 if (fi && res->prefixlen < 4)
2489                         fi = NULL;
2490         }
2491
2492         rth = rt_dst_alloc(dev_out,
2493                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2494                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2495         if (!rth)
2496                 return ERR_PTR(-ENOBUFS);
2497
2498         rth->dst.output = ip_output;
2499
2500         rth->rt_key_dst = orig_daddr;
2501         rth->rt_key_src = orig_saddr;
2502         rth->rt_genid = rt_genid(dev_net(dev_out));
2503         rth->rt_flags   = flags;
2504         rth->rt_type    = type;
2505         rth->rt_key_tos = orig_rtos;
2506         rth->rt_dst     = fl4->daddr;
2507         rth->rt_src     = fl4->saddr;
2508         rth->rt_route_iif = 0;
2509         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2510         rth->rt_oif     = orig_oif;
2511         rth->rt_mark    = fl4->flowi4_mark;
2512         rth->rt_gateway = fl4->daddr;
2513         rth->rt_spec_dst = fl4->saddr;
2514         rth->rt_peer_genid = 0;
2515         rt_init_peer(rth, (res->table ?
2516                            &res->table->tb_peers :
2517                            dev_net(dev_out)->ipv4.peers));
2518         rth->fi = NULL;
2519
2520         RT_CACHE_STAT_INC(out_slow_tot);
2521
2522         if (flags & RTCF_LOCAL) {
2523                 rth->dst.input = ip_local_deliver;
2524                 rth->rt_spec_dst = fl4->daddr;
2525         }
2526         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2527                 rth->rt_spec_dst = fl4->saddr;
2528                 if (flags & RTCF_LOCAL &&
2529                     !(dev_out->flags & IFF_LOOPBACK)) {
2530                         rth->dst.output = ip_mc_output;
2531                         RT_CACHE_STAT_INC(out_slow_mc);
2532                 }
2533 #ifdef CONFIG_IP_MROUTE
2534                 if (type == RTN_MULTICAST) {
2535                         if (IN_DEV_MFORWARD(in_dev) &&
2536                             !ipv4_is_local_multicast(fl4->daddr)) {
2537                                 rth->dst.input = ip_mr_input;
2538                                 rth->dst.output = ip_mc_output;
2539                         }
2540                 }
2541 #endif
2542         }
2543
2544         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2545
2546         return rth;
2547 }
2548
2549 /*
2550  * Major route resolver routine.
2551  * called with rcu_read_lock();
2552  */
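/* Resolution order below: validate any caller-supplied source address,
 * honour an explicit output interface, treat an empty destination as
 * loopback, then consult the FIB.  Multipath and default-route selection
 * plus preferred-source fixup happen before __mkroute_output() builds the
 * entry and rt_intern_hash() caches it.
 */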
2553
2554 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2555 {
2556         struct net_device *dev_out = NULL;
2557         __u8 tos = RT_FL_TOS(fl4);
2558         unsigned int flags = 0;
2559         struct fib_result res;
2560         struct rtable *rth;
2561         __be32 orig_daddr;
2562         __be32 orig_saddr;
2563         int orig_oif;
2564
2565         res.fi          = NULL;
2566         res.table       = NULL;
2567 #ifdef CONFIG_IP_MULTIPLE_TABLES
2568         res.r           = NULL;
2569 #endif
2570
2571         orig_daddr = fl4->daddr;
2572         orig_saddr = fl4->saddr;
2573         orig_oif = fl4->flowi4_oif;
2574
2575         fl4->flowi4_iif = net->loopback_dev->ifindex;
2576         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2577         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2578                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2579
2580         rcu_read_lock();
2581         if (fl4->saddr) {
2582                 rth = ERR_PTR(-EINVAL);
2583                 if (ipv4_is_multicast(fl4->saddr) ||
2584                     ipv4_is_lbcast(fl4->saddr) ||
2585                     ipv4_is_zeronet(fl4->saddr))
2586                         goto out;
2587
2588                 /* I removed the check for oif == dev_out->oif here.
2589                    It was wrong for two reasons:
2590                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2591                       is assigned to multiple interfaces.
2592                    2. Moreover, we are allowed to send packets with saddr
2593                       of another iface. --ANK
2594                  */
2595
2596                 if (fl4->flowi4_oif == 0 &&
2597                     (ipv4_is_multicast(fl4->daddr) ||
2598                      ipv4_is_lbcast(fl4->daddr))) {
2599                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2600                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2601                         if (dev_out == NULL)
2602                                 goto out;
2603
2604                         /* Special hack: the user can direct multicasts
2605                            and limited broadcast via the necessary interface
2606                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2607                            This hack is not just for fun, it allows
2608                            vic, vat and friends to work.
2609                            They bind the socket to loopback, set ttl to zero
2610                            and expect that it will work.
2611                            From the viewpoint of the routing cache they are broken,
2612                            because we are not allowed to build a multicast path
2613                            with a loopback source addr (look, the routing cache
2614                            cannot know that ttl is zero, so the packet
2615                            will not leave this host and the route is valid).
2616                            Luckily, this hack is a good workaround.
2617                          */
2618
2619                         fl4->flowi4_oif = dev_out->ifindex;
2620                         goto make_route;
2621                 }
2622
2623                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2624                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2625                         if (!__ip_dev_find(net, fl4->saddr, false))
2626                                 goto out;
2627                 }
2628         }
2629
2630
2631         if (fl4->flowi4_oif) {
2632                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2633                 rth = ERR_PTR(-ENODEV);
2634                 if (dev_out == NULL)
2635                         goto out;
2636
2637                 /* RACE: Check return value of inet_select_addr instead. */
2638                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2639                         rth = ERR_PTR(-ENETUNREACH);
2640                         goto out;
2641                 }
2642                 if (ipv4_is_local_multicast(fl4->daddr) ||
2643                     ipv4_is_lbcast(fl4->daddr)) {
2644                         if (!fl4->saddr)
2645                                 fl4->saddr = inet_select_addr(dev_out, 0,
2646                                                               RT_SCOPE_LINK);
2647                         goto make_route;
2648                 }
2649                 if (fl4->saddr) {
2650                         if (ipv4_is_multicast(fl4->daddr))
2651                                 fl4->saddr = inet_select_addr(dev_out, 0,
2652                                                               fl4->flowi4_scope);
2653                         else if (!fl4->daddr)
2654                                 fl4->saddr = inet_select_addr(dev_out, 0,
2655                                                               RT_SCOPE_HOST);
2656                 }
2657         }
2658
2659         if (!fl4->daddr) {
2660                 fl4->daddr = fl4->saddr;
2661                 if (!fl4->daddr)
2662                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2663                 dev_out = net->loopback_dev;
2664                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2665                 res.type = RTN_LOCAL;
2666                 flags |= RTCF_LOCAL;
2667                 goto make_route;
2668         }
2669
2670         if (fib_lookup(net, fl4, &res)) {
2671                 res.fi = NULL;
2672                 res.table = NULL;
2673                 if (fl4->flowi4_oif) {
2674                         /* Apparently, the routing tables are wrong. Assume
2675                            that the destination is on-link.
2676
2677                            WHY? DW.
2678                            Because we are allowed to send to an iface
2679                            even if it has NO routes and NO assigned
2680                            addresses. When oif is specified, the routing
2681                            tables are looked up with only one purpose:
2682                            to check whether the destination is gatewayed, rather
2683                            than direct. Moreover, if MSG_DONTROUTE is set,
2684                            we send the packet, ignoring both routing tables
2685                            and ifaddr state. --ANK
2686
2687
2688                            We could do the same even if oif is unknown
2689                            (as IPv6 likely does), but we do not.
2690                          */
2691
2692                         if (fl4->saddr == 0)
2693                                 fl4->saddr = inet_select_addr(dev_out, 0,
2694                                                               RT_SCOPE_LINK);
2695                         res.type = RTN_UNICAST;
2696                         goto make_route;
2697                 }
2698                 rth = ERR_PTR(-ENETUNREACH);
2699                 goto out;
2700         }
2701
2702         if (res.type == RTN_LOCAL) {
2703                 if (!fl4->saddr) {
2704                         if (res.fi->fib_prefsrc)
2705                                 fl4->saddr = res.fi->fib_prefsrc;
2706                         else
2707                                 fl4->saddr = fl4->daddr;
2708                 }
2709                 dev_out = net->loopback_dev;
2710                 fl4->flowi4_oif = dev_out->ifindex;
2711                 res.fi = NULL;
2712                 flags |= RTCF_LOCAL;
2713                 goto make_route;
2714         }
2715
2716 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2717         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2718                 fib_select_multipath(&res);
2719         else
2720 #endif
2721         if (!res.prefixlen &&
2722             res.table->tb_num_default > 1 &&
2723             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2724                 fib_select_default(&res);
2725
2726         if (!fl4->saddr)
2727                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2728
2729         dev_out = FIB_RES_DEV(res);
2730         fl4->flowi4_oif = dev_out->ifindex;
2731
2732
2733 make_route:
2734         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2735                                tos, dev_out, flags);
2736         if (!IS_ERR(rth)) {
2737                 unsigned int hash;
2738
2739                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2740                                rt_genid(dev_net(dev_out)));
2741                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2742         }
2743
2744 out:
2745         rcu_read_unlock();
2746         return rth;
2747 }
2748
2749 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2750 {
2751         struct rtable *rth;
2752         unsigned int hash;
2753
2754         if (!rt_caching(net))
2755                 goto slow_output;
2756
2757         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2758
2759         rcu_read_lock_bh();
2760         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2761                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2762                 if (rth->rt_key_dst == flp4->daddr &&
2763                     rth->rt_key_src == flp4->saddr &&
2764                     rt_is_output_route(rth) &&
2765                     rth->rt_oif == flp4->flowi4_oif &&
2766                     rth->rt_mark == flp4->flowi4_mark &&
2767                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2768                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2769                     net_eq(dev_net(rth->dst.dev), net) &&
2770                     !rt_is_expired(rth)) {
2771                         ipv4_validate_peer(rth);
2772                         dst_use(&rth->dst, jiffies);
2773                         RT_CACHE_STAT_INC(out_hit);
2774                         rcu_read_unlock_bh();
2775                         if (!flp4->saddr)
2776                                 flp4->saddr = rth->rt_src;
2777                         if (!flp4->daddr)
2778                                 flp4->daddr = rth->rt_dst;
2779                         return rth;
2780                 }
2781                 RT_CACHE_STAT_INC(out_hlist_search);
2782         }
2783         rcu_read_unlock_bh();
2784
2785 slow_output:
2786         return ip_route_output_slow(net, flp4);
2787 }
2788 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2789
2790 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2791 {
2792         return NULL;
2793 }
2794
2795 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2796 {
2797         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2798
2799         return mtu ? : dst->dev->mtu;
2800 }
2801
2802 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2803 {
2804 }
2805
2806 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2807                                           unsigned long old)
2808 {
2809         return NULL;
2810 }
2811
2812 static struct dst_ops ipv4_dst_blackhole_ops = {
2813         .family                 =       AF_INET,
2814         .protocol               =       cpu_to_be16(ETH_P_IP),
2815         .destroy                =       ipv4_dst_destroy,
2816         .check                  =       ipv4_blackhole_dst_check,
2817         .mtu                    =       ipv4_blackhole_mtu,
2818         .default_advmss         =       ipv4_default_advmss,
2819         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2820         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2821         .neigh_lookup           =       ipv4_neigh_lookup,
2822 };
2823
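/* Clone an existing route into a "blackhole" dst that silently discards
 * everything (both input and output are dst_discard) while preserving the
 * original route's keys, flags, gateway, metrics and peer, so the caller
 * keeps a dst with sensible addressing but no transmit side effects.
 */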
2824 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2825 {
2826         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2827         struct rtable *ort = (struct rtable *) dst_orig;
2828
2829         if (rt) {
2830                 struct dst_entry *new = &rt->dst;
2831
2832                 new->__use = 1;
2833                 new->input = dst_discard;
2834                 new->output = dst_discard;
2835                 dst_copy_metrics(new, &ort->dst);
2836
2837                 new->dev = ort->dst.dev;
2838                 if (new->dev)
2839                         dev_hold(new->dev);
2840
2841                 rt->rt_key_dst = ort->rt_key_dst;
2842                 rt->rt_key_src = ort->rt_key_src;
2843                 rt->rt_key_tos = ort->rt_key_tos;
2844                 rt->rt_route_iif = ort->rt_route_iif;
2845                 rt->rt_iif = ort->rt_iif;
2846                 rt->rt_oif = ort->rt_oif;
2847                 rt->rt_mark = ort->rt_mark;
2848
2849                 rt->rt_genid = rt_genid(net);
2850                 rt->rt_flags = ort->rt_flags;
2851                 rt->rt_type = ort->rt_type;
2852                 rt->rt_dst = ort->rt_dst;
2853                 rt->rt_src = ort->rt_src;
2854                 rt->rt_gateway = ort->rt_gateway;
2855                 rt->rt_spec_dst = ort->rt_spec_dst;
2856                 rt_transfer_peer(rt, ort);
2857                 rt->fi = ort->fi;
2858                 if (rt->fi)
2859                         atomic_inc(&rt->fi->fib_clntref);
2860
2861                 dst_free(new);
2862         }
2863
2864         dst_release(dst_orig);
2865
2866         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2867 }
2868
2869 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2870                                     struct sock *sk)
2871 {
2872         struct rtable *rt = __ip_route_output_key(net, flp4);
2873
2874         if (IS_ERR(rt))
2875                 return rt;
2876
2877         if (flp4->flowi4_proto)
2878                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2879                                                    flowi4_to_flowi(flp4),
2880                                                    sk, 0);
2881
2882         return rt;
2883 }
2884 EXPORT_SYMBOL_GPL(ip_route_output_flow);
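/* Typical caller usage (illustrative sketch only; the addresses and the
 * protocol below are placeholders and error handling is elided):
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= dst_addr,
 *		.saddr		= src_addr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	skb_dst_set(skb, &rt->dst);
 */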
2885
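/* Serialize a cached route into an rtnetlink message: route keys, gateway,
 * metrics and mark, plus, when an inet_peer is attached, the current IP ID
 * counter, TCP timestamp data and the remaining PMTU expiry.  For cached
 * multicast input routes the mroute code is asked to fill in the
 * forwarding information.
 */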
2886 static int rt_fill_info(struct net *net,
2887                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2888                         int nowait, unsigned int flags)
2889 {
2890         struct rtable *rt = skb_rtable(skb);
2891         struct rtmsg *r;
2892         struct nlmsghdr *nlh;
2893         unsigned long expires = 0;
2894         u32 id = 0, ts = 0, tsage = 0, error;
2895
2896         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2897         if (nlh == NULL)
2898                 return -EMSGSIZE;
2899
2900         r = nlmsg_data(nlh);
2901         r->rtm_family    = AF_INET;
2902         r->rtm_dst_len  = 32;
2903         r->rtm_src_len  = 0;
2904         r->rtm_tos      = rt->rt_key_tos;
2905         r->rtm_table    = RT_TABLE_MAIN;
2906         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2907                 goto nla_put_failure;
2908         r->rtm_type     = rt->rt_type;
2909         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2910         r->rtm_protocol = RTPROT_UNSPEC;
2911         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2912         if (rt->rt_flags & RTCF_NOTIFY)
2913                 r->rtm_flags |= RTM_F_NOTIFY;
2914
2915         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2916                 goto nla_put_failure;
2917         if (rt->rt_key_src) {
2918                 r->rtm_src_len = 32;
2919                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2920                         goto nla_put_failure;
2921         }
2922         if (rt->dst.dev &&
2923             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2924                 goto nla_put_failure;
2925 #ifdef CONFIG_IP_ROUTE_CLASSID
2926         if (rt->dst.tclassid &&
2927             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2928                 goto nla_put_failure;
2929 #endif
2930         if (rt_is_input_route(rt)) {
2931                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2932                         goto nla_put_failure;
2933         } else if (rt->rt_src != rt->rt_key_src) {
2934                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2935                         goto nla_put_failure;
2936         }
2937         if (rt->rt_dst != rt->rt_gateway &&
2938             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2939                 goto nla_put_failure;
2940
2941         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2942                 goto nla_put_failure;
2943
2944         if (rt->rt_mark &&
2945             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2946                 goto nla_put_failure;
2947
2948         error = rt->dst.error;
2949         if (rt_has_peer(rt)) {
2950                 const struct inet_peer *peer = rt_peer_ptr(rt);
2951                 inet_peer_refcheck(peer);
2952                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2953                 if (peer->tcp_ts_stamp) {
2954                         ts = peer->tcp_ts;
2955                         tsage = get_seconds() - peer->tcp_ts_stamp;
2956                 }
2957                 expires = ACCESS_ONCE(peer->pmtu_expires);
2958                 if (expires) {
2959                         if (time_before(jiffies, expires))
2960                                 expires -= jiffies;
2961                         else
2962                                 expires = 0;
2963                 }
2964         }
2965
2966         if (rt_is_input_route(rt)) {
2967 #ifdef CONFIG_IP_MROUTE
2968                 __be32 dst = rt->rt_dst;
2969
2970                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2971                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2972                         int err = ipmr_get_route(net, skb,
2973                                                  rt->rt_src, rt->rt_dst,
2974                                                  r, nowait);
2975                         if (err <= 0) {
2976                                 if (!nowait) {
2977                                         if (err == 0)
2978                                                 return 0;
2979                                         goto nla_put_failure;
2980                                 } else {
2981                                         if (err == -EMSGSIZE)
2982                                                 goto nla_put_failure;
2983                                         error = err;
2984                                 }
2985                         }
2986                 } else
2987 #endif
2988                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2989                                 goto nla_put_failure;
2990         }
2991
2992         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2993                                expires, error) < 0)
2994                 goto nla_put_failure;
2995
2996         return nlmsg_end(skb, nlh);
2997
2998 nla_put_failure:
2999         nlmsg_cancel(skb, nlh);
3000         return -EMSGSIZE;
3001 }
3002
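/*
 * RTM_GETROUTE handler (this is what "ip route get <addr>" ends up
 * calling).  A dummy skb is built so the normal routing code can run on
 * it: with RTA_IIF the route is resolved via ip_route_input() as if the
 * packet had arrived on that device, otherwise via ip_route_output_key().
 * The result is encoded with rt_fill_info() and unicast back to the
 * requester.
 */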
3003 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3004 {
3005         struct net *net = sock_net(in_skb->sk);
3006         struct rtmsg *rtm;
3007         struct nlattr *tb[RTA_MAX+1];
3008         struct rtable *rt = NULL;
3009         __be32 dst = 0;
3010         __be32 src = 0;
3011         u32 iif;
3012         int err;
3013         int mark;
3014         struct sk_buff *skb;
3015
3016         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3017         if (err < 0)
3018                 goto errout;
3019
3020         rtm = nlmsg_data(nlh);
3021
3022         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3023         if (skb == NULL) {
3024                 err = -ENOBUFS;
3025                 goto errout;
3026         }
3027
3028         /* Reserve room for dummy headers; this skb can pass
3029          * through a good chunk of the routing engine.
3030          */
3031         skb_reset_mac_header(skb);
3032         skb_reset_network_header(skb);
3033
3034         /* Bugfix: need to give ip_route_input() enough of an IP header so it does not choke. */
3035         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3036         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3037
3038         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3039         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3040         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3041         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3042
3043         if (iif) {
3044                 struct net_device *dev;
3045
3046                 dev = __dev_get_by_index(net, iif);
3047                 if (dev == NULL) {
3048                         err = -ENODEV;
3049                         goto errout_free;
3050                 }
3051
3052                 skb->protocol   = htons(ETH_P_IP);
3053                 skb->dev        = dev;
3054                 skb->mark       = mark;
3055                 local_bh_disable();
3056                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3057                 local_bh_enable();
3058
3059                 rt = skb_rtable(skb);
3060                 if (err == 0 && rt->dst.error)
3061                         err = -rt->dst.error;
3062         } else {
3063                 struct flowi4 fl4 = {
3064                         .daddr = dst,
3065                         .saddr = src,
3066                         .flowi4_tos = rtm->rtm_tos,
3067                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3068                         .flowi4_mark = mark,
3069                 };
3070                 rt = ip_route_output_key(net, &fl4);
3071
3072                 err = 0;
3073                 if (IS_ERR(rt))
3074                         err = PTR_ERR(rt);
3075         }
3076
3077         if (err)
3078                 goto errout_free;
3079
3080         skb_dst_set(skb, &rt->dst);
3081         if (rtm->rtm_flags & RTM_F_NOTIFY)
3082                 rt->rt_flags |= RTCF_NOTIFY;
3083
3084         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3085                            RTM_NEWROUTE, 0, 0);
3086         if (err <= 0)
3087                 goto errout_free;
3088
3089         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3090 errout:
3091         return err;
3092
3093 errout_free:
3094         kfree_skb(skb);
3095         goto errout;
3096 }
3097
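/*
 * Dump callback for RTM_GETROUTE: walk every bucket of the route cache
 * under rcu_read_lock_bh() and emit one RTM_NEWROUTE per entry that
 * belongs to the requesting namespace and has not expired.  cb->args[0]
 * and cb->args[1] record the bucket and index so the dump can resume
 * where it left off when the skb fills up.
 */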
3098 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3099 {
3100         struct rtable *rt;
3101         int h, s_h;
3102         int idx, s_idx;
3103         struct net *net;
3104
3105         net = sock_net(skb->sk);
3106
3107         s_h = cb->args[0];
3108         if (s_h < 0)
3109                 s_h = 0;
3110         s_idx = idx = cb->args[1];
3111         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3112                 if (!rt_hash_table[h].chain)
3113                         continue;
3114                 rcu_read_lock_bh();
3115                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3116                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3117                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3118                                 continue;
3119                         if (rt_is_expired(rt))
3120                                 continue;
3121                         skb_dst_set_noref(skb, &rt->dst);
3122                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3123                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3124                                          1, NLM_F_MULTI) <= 0) {
3125                                 skb_dst_drop(skb);
3126                                 rcu_read_unlock_bh();
3127                                 goto done;
3128                         }
3129                         skb_dst_drop(skb);
3130                 }
3131                 rcu_read_unlock_bh();
3132         }
3133
3134 done:
3135         cb->args[0] = h;
3136         cb->args[1] = idx;
3137         return skb->len;
3138 }
3139
3140 void ip_rt_multicast_event(struct in_device *in_dev)
3141 {
3142         rt_cache_flush(dev_net(in_dev->dev), 0);
3143 }
3144
3145 #ifdef CONFIG_SYSCTL
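/*
 * Handler for the write-only "flush" sysctl.  The written value is parsed
 * into a stack copy of the ctl_table (so the shared table never points at
 * on-stack data) and handed to rt_cache_flush() as the flush delay for the
 * namespace stored in ->extra1.  From userspace, for example:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */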
3146 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3147                                         void __user *buffer,
3148                                         size_t *lenp, loff_t *ppos)
3149 {
3150         if (write) {
3151                 int flush_delay;
3152                 ctl_table ctl;
3153                 struct net *net;
3154
3155                 memcpy(&ctl, __ctl, sizeof(ctl));
3156                 ctl.data = &flush_delay;
3157                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3158
3159                 net = (struct net *)__ctl->extra1;
3160                 rt_cache_flush(net, flush_delay);
3161                 return 0;
3162         }
3163
3164         return -EINVAL;
3165 }
3166
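/*
 * Tunables exported under /proc/sys/net/ipv4/route/.  They control the
 * size and garbage collection of the route cache (gc_*, max_size), ICMP
 * redirect and error rate limiting (redirect_*, error_*), and path-MTU
 * behaviour (mtu_expires, min_pmtu, min_adv_mss).
 */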
3167 static ctl_table ipv4_route_table[] = {
3168         {
3169                 .procname       = "gc_thresh",
3170                 .data           = &ipv4_dst_ops.gc_thresh,
3171                 .maxlen         = sizeof(int),
3172                 .mode           = 0644,
3173                 .proc_handler   = proc_dointvec,
3174         },
3175         {
3176                 .procname       = "max_size",
3177                 .data           = &ip_rt_max_size,
3178                 .maxlen         = sizeof(int),
3179                 .mode           = 0644,
3180                 .proc_handler   = proc_dointvec,
3181         },
3182         {
3183                 /*  Deprecated. Use gc_min_interval_ms */
3184
3185                 .procname       = "gc_min_interval",
3186                 .data           = &ip_rt_gc_min_interval,
3187                 .maxlen         = sizeof(int),
3188                 .mode           = 0644,
3189                 .proc_handler   = proc_dointvec_jiffies,
3190         },
3191         {
3192                 .procname       = "gc_min_interval_ms",
3193                 .data           = &ip_rt_gc_min_interval,
3194                 .maxlen         = sizeof(int),
3195                 .mode           = 0644,
3196                 .proc_handler   = proc_dointvec_ms_jiffies,
3197         },
3198         {
3199                 .procname       = "gc_timeout",
3200                 .data           = &ip_rt_gc_timeout,
3201                 .maxlen         = sizeof(int),
3202                 .mode           = 0644,
3203                 .proc_handler   = proc_dointvec_jiffies,
3204         },
3205         {
3206                 .procname       = "gc_interval",
3207                 .data           = &ip_rt_gc_interval,
3208                 .maxlen         = sizeof(int),
3209                 .mode           = 0644,
3210                 .proc_handler   = proc_dointvec_jiffies,
3211         },
3212         {
3213                 .procname       = "redirect_load",
3214                 .data           = &ip_rt_redirect_load,
3215                 .maxlen         = sizeof(int),
3216                 .mode           = 0644,
3217                 .proc_handler   = proc_dointvec,
3218         },
3219         {
3220                 .procname       = "redirect_number",
3221                 .data           = &ip_rt_redirect_number,
3222                 .maxlen         = sizeof(int),
3223                 .mode           = 0644,
3224                 .proc_handler   = proc_dointvec,
3225         },
3226         {
3227                 .procname       = "redirect_silence",
3228                 .data           = &ip_rt_redirect_silence,
3229                 .maxlen         = sizeof(int),
3230                 .mode           = 0644,
3231                 .proc_handler   = proc_dointvec,
3232         },
3233         {
3234                 .procname       = "error_cost",
3235                 .data           = &ip_rt_error_cost,
3236                 .maxlen         = sizeof(int),
3237                 .mode           = 0644,
3238                 .proc_handler   = proc_dointvec,
3239         },
3240         {
3241                 .procname       = "error_burst",
3242                 .data           = &ip_rt_error_burst,
3243                 .maxlen         = sizeof(int),
3244                 .mode           = 0644,
3245                 .proc_handler   = proc_dointvec,
3246         },
3247         {
3248                 .procname       = "gc_elasticity",
3249                 .data           = &ip_rt_gc_elasticity,
3250                 .maxlen         = sizeof(int),
3251                 .mode           = 0644,
3252                 .proc_handler   = proc_dointvec,
3253         },
3254         {
3255                 .procname       = "mtu_expires",
3256                 .data           = &ip_rt_mtu_expires,
3257                 .maxlen         = sizeof(int),
3258                 .mode           = 0644,
3259                 .proc_handler   = proc_dointvec_jiffies,
3260         },
3261         {
3262                 .procname       = "min_pmtu",
3263                 .data           = &ip_rt_min_pmtu,
3264                 .maxlen         = sizeof(int),
3265                 .mode           = 0644,
3266                 .proc_handler   = proc_dointvec,
3267         },
3268         {
3269                 .procname       = "min_adv_mss",
3270                 .data           = &ip_rt_min_advmss,
3271                 .maxlen         = sizeof(int),
3272                 .mode           = 0644,
3273                 .proc_handler   = proc_dointvec,
3274         },
3275         { }
3276 };
3277
3278 static struct ctl_table ipv4_route_flush_table[] = {
3279         {
3280                 .procname       = "flush",
3281                 .maxlen         = sizeof(int),
3282                 .mode           = 0200,
3283                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3284         },
3285         { },
3286 };
3287
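/*
 * Per-namespace registration of the "flush" entry above.  Namespaces other
 * than init_net get a kmemdup()'d copy of the table so each one can carry
 * its own struct net pointer in ->extra1 for the flush handler.
 */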
3288 static __net_init int sysctl_route_net_init(struct net *net)
3289 {
3290         struct ctl_table *tbl;
3291
3292         tbl = ipv4_route_flush_table;
3293         if (!net_eq(net, &init_net)) {
3294                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3295                 if (tbl == NULL)
3296                         goto err_dup;
3297         }
3298         tbl[0].extra1 = net;
3299
3300         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3301         if (net->ipv4.route_hdr == NULL)
3302                 goto err_reg;
3303         return 0;
3304
3305 err_reg:
3306         if (tbl != ipv4_route_flush_table)
3307                 kfree(tbl);
3308 err_dup:
3309         return -ENOMEM;
3310 }
3311
3312 static __net_exit void sysctl_route_net_exit(struct net *net)
3313 {
3314         struct ctl_table *tbl;
3315
3316         tbl = net->ipv4.route_hdr->ctl_table_arg;
3317         unregister_net_sysctl_table(net->ipv4.route_hdr);
3318         BUG_ON(tbl == ipv4_route_flush_table);
3319         kfree(tbl);
3320 }
3321
3322 static __net_initdata struct pernet_operations sysctl_route_ops = {
3323         .init = sysctl_route_net_init,
3324         .exit = sysctl_route_net_exit,
3325 };
3326 #endif
3327
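/*
 * Seed the per-namespace route generation counter and the device address
 * generation id with random values.  Cached routes record rt_genid at
 * creation time; entries whose generation no longer matches are treated
 * as expired (see the rt_is_expired() check in the dump code above).
 */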
3328 static __net_init int rt_genid_init(struct net *net)
3329 {
3330         get_random_bytes(&net->ipv4.rt_genid,
3331                          sizeof(net->ipv4.rt_genid));
3332         get_random_bytes(&net->ipv4.dev_addr_genid,
3333                          sizeof(net->ipv4.dev_addr_genid));
3334         return 0;
3335 }
3336
3337 static __net_initdata struct pernet_operations rt_genid_ops = {
3338         .init = rt_genid_init,
3339 };
3340
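/*
 * Allocate and initialise the per-namespace inet_peer_base that roots the
 * IPv4 peer tree (net->ipv4.peers); ipv4_inetpeer_exit() below invalidates
 * the tree and frees the base when the namespace goes away.
 */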
3341 static int __net_init ipv4_inetpeer_init(struct net *net)
3342 {
3343         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3344
3345         if (!bp)
3346                 return -ENOMEM;
3347         inet_peer_base_init(bp);
3348         net->ipv4.peers = bp;
3349         return 0;
3350 }
3351
3352 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3353 {
3354         struct inet_peer_base *bp = net->ipv4.peers;
3355
3356         net->ipv4.peers = NULL;
3357         inetpeer_invalidate_tree(bp);
3358         kfree(bp);
3359 }
3360
3361 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3362         .init   =       ipv4_inetpeer_init,
3363         .exit   =       ipv4_inetpeer_exit,
3364 };
3365
3366 #ifdef CONFIG_IP_ROUTE_CLASSID
3367 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3368 #endif /* CONFIG_IP_ROUTE_CLASSID */
3369
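/*
 * "rhash_entries=" boot parameter: overrides the number of route cache
 * hash buckets that ip_rt_init() would otherwise size from available
 * memory via alloc_large_system_hash().
 */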
3370 static __initdata unsigned long rhash_entries;
3371 static int __init set_rhash_entries(char *str)
3372 {
3373         ssize_t ret;
3374
3375         if (!str)
3376                 return 0;
3377
3378         ret = kstrtoul(str, 0, &rhash_entries);
3379         if (ret)
3380                 return 0;
3381
3382         return 1;
3383 }
3384 __setup("rhash_entries=", set_rhash_entries);
3385
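/*
 * Boot-time initialisation of the IPv4 routing layer: allocate the dst
 * slab and the route cache hash table, scale gc_thresh and ip_rt_max_size
 * to the table size, bring up devinet and the FIB, schedule the periodic
 * expiry work, create the /proc files, and register the RTM_GETROUTE
 * handler plus the per-namespace sysctl/genid/inetpeer operations.
 */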
3386 int __init ip_rt_init(void)
3387 {
3388         int rc = 0;
3389
3390 #ifdef CONFIG_IP_ROUTE_CLASSID
3391         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3392         if (!ip_rt_acct)
3393                 panic("IP: failed to allocate ip_rt_acct\n");
3394 #endif
3395
3396         ipv4_dst_ops.kmem_cachep =
3397                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3398                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3399
3400         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3401
3402         if (dst_entries_init(&ipv4_dst_ops) < 0)
3403                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3404
3405         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3406                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3407
3408         rt_hash_table = (struct rt_hash_bucket *)
3409                 alloc_large_system_hash("IP route cache",
3410                                         sizeof(struct rt_hash_bucket),
3411                                         rhash_entries,
3412                                         (totalram_pages >= 128 * 1024) ?
3413                                         15 : 17,
3414                                         0,
3415                                         &rt_hash_log,
3416                                         &rt_hash_mask,
3417                                         0,
3418                                         rhash_entries ? 0 : 512 * 1024);
3419         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3420         rt_hash_lock_init();
3421
3422         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3423         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3424
3425         devinet_init();
3426         ip_fib_init();
3427
3428         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3429         expires_ljiffies = jiffies;
3430         schedule_delayed_work(&expires_work,
3431                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3432
3433         if (ip_rt_proc_init())
3434                 pr_err("Unable to create route proc files\n");
3435 #ifdef CONFIG_XFRM
3436         xfrm_init();
3437         xfrm4_init(ip_rt_max_size);
3438 #endif
3439         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3440
3441 #ifdef CONFIG_SYSCTL
3442         register_pernet_subsys(&sysctl_route_ops);
3443 #endif
3444         register_pernet_subsys(&rt_genid_ops);
3445         register_pernet_subsys(&ipv4_inetpeer_ops);
3446         return rc;
3447 }
3448
3449 #ifdef CONFIG_SYSCTL
3450 /*
3451  * We really need to sanitize the damn ipv4 init order, then all
3452  * this nonsense will go away.
3453  */
3454 void __init ip_static_sysctl_init(void)
3455 {
3456         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3457 }
3458 #endif