net/ipv4/route.c (karo-tx-linux.git)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
116 #define RT_FL_TOS(oldflp4) \
117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define IP_MAX_MTU      0xFFF0
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly  = 9;
128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly       = HZ;
131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly    = 8;
133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly       = 256;
136 static int rt_chain_length_max __read_mostly    = 20;
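/* Illustrative arithmetic (a reading of the defaults above, not part of the
 * original source): ip_rt_redirect_silence = (HZ / 50) << 10 = 1024 * (HZ / 50),
 * i.e. roughly 20 * HZ jiffies (about 20 seconds) regardless of the HZ value,
 * and ip_rt_min_pmtu = 512 + 20 + 20 = 552 bytes, presumably 512 bytes of
 * payload plus minimal IP and TCP headers.
 */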
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155                             int how)
156 {
157 }
158
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161         struct rtable *rt = (struct rtable *) dst;
162         struct inet_peer *peer;
163         u32 *p = NULL;
164
165         peer = rt_get_peer_create(rt, rt->rt_dst);
166         if (peer) {
167                 u32 *old_p = __DST_METRICS_PTR(old);
168                 unsigned long prev, new;
169
170                 p = peer->metrics;
171                 if (inet_metrics_new(peer))
172                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
173
174                 new = (unsigned long) p;
175                 prev = cmpxchg(&dst->_metrics, old, new);
176
177                 if (prev != old) {
178                         p = __DST_METRICS_PTR(prev);
179                         if (prev & DST_METRICS_READ_ONLY)
180                                 p = NULL;
181                 } else {
182                         if (rt->fi) {
183                                 fib_info_put(rt->fi);
184                                 rt->fi = NULL;
185                         }
186                 }
187         }
188         return p;
189 }
190
191 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
192
193 static struct dst_ops ipv4_dst_ops = {
194         .family =               AF_INET,
195         .protocol =             cpu_to_be16(ETH_P_IP),
196         .gc =                   rt_garbage_collect,
197         .check =                ipv4_dst_check,
198         .default_advmss =       ipv4_default_advmss,
199         .mtu =                  ipv4_mtu,
200         .cow_metrics =          ipv4_cow_metrics,
201         .destroy =              ipv4_dst_destroy,
202         .ifdown =               ipv4_dst_ifdown,
203         .negative_advice =      ipv4_negative_advice,
204         .link_failure =         ipv4_link_failure,
205         .update_pmtu =          ip_rt_update_pmtu,
206         .local_out =            __ip_local_out,
207         .neigh_lookup =         ipv4_neigh_lookup,
208 };
209
210 #define ECN_OR_COST(class)      TC_PRIO_##class
211
212 const __u8 ip_tos2prio[16] = {
213         TC_PRIO_BESTEFFORT,
214         ECN_OR_COST(BESTEFFORT),
215         TC_PRIO_BESTEFFORT,
216         ECN_OR_COST(BESTEFFORT),
217         TC_PRIO_BULK,
218         ECN_OR_COST(BULK),
219         TC_PRIO_BULK,
220         ECN_OR_COST(BULK),
221         TC_PRIO_INTERACTIVE,
222         ECN_OR_COST(INTERACTIVE),
223         TC_PRIO_INTERACTIVE,
224         ECN_OR_COST(INTERACTIVE),
225         TC_PRIO_INTERACTIVE_BULK,
226         ECN_OR_COST(INTERACTIVE_BULK),
227         TC_PRIO_INTERACTIVE_BULK,
228         ECN_OR_COST(INTERACTIVE_BULK)
229 };
230 EXPORT_SYMBOL(ip_tos2prio);
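/* Illustrative note, assuming the usual rt_tos2priority() helper from
 * include/net/route.h (not part of this file): the table is indexed as
 * ip_tos2prio[IPTOS_TOS(tos) >> 1], so e.g. tos 0x10 (IPTOS_LOWDELAY)
 * yields index 8 and maps to TC_PRIO_INTERACTIVE.
 */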
231
232 /*
233  * Route cache.
234  */
235
236 /* The locking scheme is rather straightforward:
237  *
238  * 1) Read-Copy Update protects the buckets of the central route hash.
239  * 2) Only writers remove entries, and they hold the lock
240  *    as they look at rtable reference counts.
241  * 3) Only readers acquire references to rtable entries;
242  *    they do so with atomic increments and with the
243  *    lock held.
244  */
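/* Illustrative sketch of that pattern as used later in this file (readers as
 * in rt_cache_get_first(), writers as in rt_do_flush()/rt_intern_hash()):
 *
 *	rcu_read_lock_bh();
 *	r = rcu_dereference_bh(rt_hash_table[hash].chain);
 *	for (; r; r = rcu_dereference_bh(r->dst.rt_next))
 *		...				(read side, no bucket lock)
 *	rcu_read_unlock_bh();
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 *	spin_unlock_bh(rt_hash_lock_addr(hash));	(write-side publish)
 */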
245
246 struct rt_hash_bucket {
247         struct rtable __rcu     *chain;
248 };
249
250 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
251         defined(CONFIG_PROVE_LOCKING)
252 /*
253  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
254  * The size of this table is a power of two and depends on the number of CPUs.
255  * (With lockdep we have a quite big spinlock_t, so keep the size down there.)
256  */
257 #ifdef CONFIG_LOCKDEP
258 # define RT_HASH_LOCK_SZ        256
259 #else
260 # if NR_CPUS >= 32
261 #  define RT_HASH_LOCK_SZ       4096
262 # elif NR_CPUS >= 16
263 #  define RT_HASH_LOCK_SZ       2048
264 # elif NR_CPUS >= 8
265 #  define RT_HASH_LOCK_SZ       1024
266 # elif NR_CPUS >= 4
267 #  define RT_HASH_LOCK_SZ       512
268 # else
269 #  define RT_HASH_LOCK_SZ       256
270 # endif
271 #endif
272
273 static spinlock_t       *rt_hash_locks;
274 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
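/* Worked example (derived from the defines above): with lockdep disabled and
 * NR_CPUS >= 32, RT_HASH_LOCK_SZ is 4096, so hash buckets whose indices differ
 * by a multiple of 4096 share a lock, e.g. buckets 7 and 4103 both resolve to
 * &rt_hash_locks[7].
 */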
275
276 static __init void rt_hash_lock_init(void)
277 {
278         int i;
279
280         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
281                         GFP_KERNEL);
282         if (!rt_hash_locks)
283                 panic("IP: failed to allocate rt_hash_locks\n");
284
285         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
286                 spin_lock_init(&rt_hash_locks[i]);
287 }
288 #else
289 # define rt_hash_lock_addr(slot) NULL
290
291 static inline void rt_hash_lock_init(void)
292 {
293 }
294 #endif
295
296 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
297 static unsigned int             rt_hash_mask __read_mostly;
298 static unsigned int             rt_hash_log  __read_mostly;
299
300 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
301 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
302
303 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
304                                    int genid)
305 {
306         return jhash_3words((__force u32)daddr, (__force u32)saddr,
307                             idx, genid)
308                 & rt_hash_mask;
309 }
310
311 static inline int rt_genid(struct net *net)
312 {
313         return atomic_read(&net->ipv4.rt_genid);
314 }
315
316 #ifdef CONFIG_PROC_FS
317 struct rt_cache_iter_state {
318         struct seq_net_private p;
319         int bucket;
320         int genid;
321 };
322
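/* Note on the iterator's locking convention (derived from the code below):
 * rt_cache_get_first() and __rt_cache_get_next() return with rcu_read_lock_bh()
 * held whenever they return a non-NULL entry; the lock is dropped when the walk
 * falls off the last bucket, or in rt_cache_seq_stop().
 */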
323 static struct rtable *rt_cache_get_first(struct seq_file *seq)
324 {
325         struct rt_cache_iter_state *st = seq->private;
326         struct rtable *r = NULL;
327
328         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
329                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
330                         continue;
331                 rcu_read_lock_bh();
332                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
333                 while (r) {
334                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
335                             r->rt_genid == st->genid)
336                                 return r;
337                         r = rcu_dereference_bh(r->dst.rt_next);
338                 }
339                 rcu_read_unlock_bh();
340         }
341         return r;
342 }
343
344 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
345                                           struct rtable *r)
346 {
347         struct rt_cache_iter_state *st = seq->private;
348
349         r = rcu_dereference_bh(r->dst.rt_next);
350         while (!r) {
351                 rcu_read_unlock_bh();
352                 do {
353                         if (--st->bucket < 0)
354                                 return NULL;
355                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
356                 rcu_read_lock_bh();
357                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
358         }
359         return r;
360 }
361
362 static struct rtable *rt_cache_get_next(struct seq_file *seq,
363                                         struct rtable *r)
364 {
365         struct rt_cache_iter_state *st = seq->private;
366         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
367                 if (dev_net(r->dst.dev) != seq_file_net(seq))
368                         continue;
369                 if (r->rt_genid == st->genid)
370                         break;
371         }
372         return r;
373 }
374
375 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
376 {
377         struct rtable *r = rt_cache_get_first(seq);
378
379         if (r)
380                 while (pos && (r = rt_cache_get_next(seq, r)))
381                         --pos;
382         return pos ? NULL : r;
383 }
384
385 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
386 {
387         struct rt_cache_iter_state *st = seq->private;
388         if (*pos)
389                 return rt_cache_get_idx(seq, *pos - 1);
390         st->genid = rt_genid(seq_file_net(seq));
391         return SEQ_START_TOKEN;
392 }
393
394 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
395 {
396         struct rtable *r;
397
398         if (v == SEQ_START_TOKEN)
399                 r = rt_cache_get_first(seq);
400         else
401                 r = rt_cache_get_next(seq, v);
402         ++*pos;
403         return r;
404 }
405
406 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
407 {
408         if (v && v != SEQ_START_TOKEN)
409                 rcu_read_unlock_bh();
410 }
411
412 static int rt_cache_seq_show(struct seq_file *seq, void *v)
413 {
414         if (v == SEQ_START_TOKEN)
415                 seq_printf(seq, "%-127s\n",
416                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
417                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
418                            "HHUptod\tSpecDst");
419         else {
420                 struct rtable *r = v;
421                 struct neighbour *n;
422                 int len, HHUptod;
423
424                 rcu_read_lock();
425                 n = dst_get_neighbour_noref(&r->dst);
426                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
427                 rcu_read_unlock();
428
429                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
430                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
431                         r->dst.dev ? r->dst.dev->name : "*",
432                         (__force u32)r->rt_dst,
433                         (__force u32)r->rt_gateway,
434                         r->rt_flags, atomic_read(&r->dst.__refcnt),
435                         r->dst.__use, 0, (__force u32)r->rt_src,
436                         dst_metric_advmss(&r->dst) + 40,
437                         dst_metric(&r->dst, RTAX_WINDOW),
438                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
439                               dst_metric(&r->dst, RTAX_RTTVAR)),
440                         r->rt_key_tos,
441                         -1,
442                         HHUptod,
443                         r->rt_spec_dst, &len);
444
445                 seq_printf(seq, "%*s\n", 127 - len, "");
446         }
447         return 0;
448 }
449
450 static const struct seq_operations rt_cache_seq_ops = {
451         .start  = rt_cache_seq_start,
452         .next   = rt_cache_seq_next,
453         .stop   = rt_cache_seq_stop,
454         .show   = rt_cache_seq_show,
455 };
456
457 static int rt_cache_seq_open(struct inode *inode, struct file *file)
458 {
459         return seq_open_net(inode, file, &rt_cache_seq_ops,
460                         sizeof(struct rt_cache_iter_state));
461 }
462
463 static const struct file_operations rt_cache_seq_fops = {
464         .owner   = THIS_MODULE,
465         .open    = rt_cache_seq_open,
466         .read    = seq_read,
467         .llseek  = seq_lseek,
468         .release = seq_release_net,
469 };
470
471
472 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
473 {
474         int cpu;
475
476         if (*pos == 0)
477                 return SEQ_START_TOKEN;
478
479         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
480                 if (!cpu_possible(cpu))
481                         continue;
482                 *pos = cpu+1;
483                 return &per_cpu(rt_cache_stat, cpu);
484         }
485         return NULL;
486 }
487
488 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
489 {
490         int cpu;
491
492         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
493                 if (!cpu_possible(cpu))
494                         continue;
495                 *pos = cpu+1;
496                 return &per_cpu(rt_cache_stat, cpu);
497         }
498         return NULL;
499
500 }
501
502 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
503 {
504
505 }
506
507 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
508 {
509         struct rt_cache_stat *st = v;
510
511         if (v == SEQ_START_TOKEN) {
512                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
513                 return 0;
514         }
515
516         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
517                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
518                    dst_entries_get_slow(&ipv4_dst_ops),
519                    st->in_hit,
520                    st->in_slow_tot,
521                    st->in_slow_mc,
522                    st->in_no_route,
523                    st->in_brd,
524                    st->in_martian_dst,
525                    st->in_martian_src,
526
527                    st->out_hit,
528                    st->out_slow_tot,
529                    st->out_slow_mc,
530
531                    st->gc_total,
532                    st->gc_ignored,
533                    st->gc_goal_miss,
534                    st->gc_dst_overflow,
535                    st->in_hlist_search,
536                    st->out_hlist_search
537                 );
538         return 0;
539 }
540
541 static const struct seq_operations rt_cpu_seq_ops = {
542         .start  = rt_cpu_seq_start,
543         .next   = rt_cpu_seq_next,
544         .stop   = rt_cpu_seq_stop,
545         .show   = rt_cpu_seq_show,
546 };
547
548
549 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
550 {
551         return seq_open(file, &rt_cpu_seq_ops);
552 }
553
554 static const struct file_operations rt_cpu_seq_fops = {
555         .owner   = THIS_MODULE,
556         .open    = rt_cpu_seq_open,
557         .read    = seq_read,
558         .llseek  = seq_lseek,
559         .release = seq_release,
560 };
561
562 #ifdef CONFIG_IP_ROUTE_CLASSID
563 static int rt_acct_proc_show(struct seq_file *m, void *v)
564 {
565         struct ip_rt_acct *dst, *src;
566         unsigned int i, j;
567
568         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
569         if (!dst)
570                 return -ENOMEM;
571
572         for_each_possible_cpu(i) {
573                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
574                 for (j = 0; j < 256; j++) {
575                         dst[j].o_bytes   += src[j].o_bytes;
576                         dst[j].o_packets += src[j].o_packets;
577                         dst[j].i_bytes   += src[j].i_bytes;
578                         dst[j].i_packets += src[j].i_packets;
579                 }
580         }
581
582         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
583         kfree(dst);
584         return 0;
585 }
586
587 static int rt_acct_proc_open(struct inode *inode, struct file *file)
588 {
589         return single_open(file, rt_acct_proc_show, NULL);
590 }
591
592 static const struct file_operations rt_acct_proc_fops = {
593         .owner          = THIS_MODULE,
594         .open           = rt_acct_proc_open,
595         .read           = seq_read,
596         .llseek         = seq_lseek,
597         .release        = single_release,
598 };
599 #endif
600
601 static int __net_init ip_rt_do_proc_init(struct net *net)
602 {
603         struct proc_dir_entry *pde;
604
605         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
606                         &rt_cache_seq_fops);
607         if (!pde)
608                 goto err1;
609
610         pde = proc_create("rt_cache", S_IRUGO,
611                           net->proc_net_stat, &rt_cpu_seq_fops);
612         if (!pde)
613                 goto err2;
614
615 #ifdef CONFIG_IP_ROUTE_CLASSID
616         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
617         if (!pde)
618                 goto err3;
619 #endif
620         return 0;
621
622 #ifdef CONFIG_IP_ROUTE_CLASSID
623 err3:
624         remove_proc_entry("rt_cache", net->proc_net_stat);
625 #endif
626 err2:
627         remove_proc_entry("rt_cache", net->proc_net);
628 err1:
629         return -ENOMEM;
630 }
631
632 static void __net_exit ip_rt_do_proc_exit(struct net *net)
633 {
634         remove_proc_entry("rt_cache", net->proc_net_stat);
635         remove_proc_entry("rt_cache", net->proc_net);
636 #ifdef CONFIG_IP_ROUTE_CLASSID
637         remove_proc_entry("rt_acct", net->proc_net);
638 #endif
639 }
640
641 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
642         .init = ip_rt_do_proc_init,
643         .exit = ip_rt_do_proc_exit,
644 };
645
646 static int __init ip_rt_proc_init(void)
647 {
648         return register_pernet_subsys(&ip_rt_proc_ops);
649 }
650
651 #else
652 static inline int ip_rt_proc_init(void)
653 {
654         return 0;
655 }
656 #endif /* CONFIG_PROC_FS */
657
658 static inline void rt_free(struct rtable *rt)
659 {
660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
661 }
662
663 static inline void rt_drop(struct rtable *rt)
664 {
665         ip_rt_put(rt);
666         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
667 }
668
669 static inline int rt_fast_clean(struct rtable *rth)
670 {
671         /* Kill broadcast/multicast entries very aggressively, if they
672            collide in the hash table with more useful entries */
673         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
674                 rt_is_input_route(rth) && rth->dst.rt_next;
675 }
676
677 static inline int rt_valuable(struct rtable *rth)
678 {
679         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
680                 (rth->peer && rth->peer->pmtu_expires);
681 }
682
683 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
684 {
685         unsigned long age;
686         int ret = 0;
687
688         if (atomic_read(&rth->dst.__refcnt))
689                 goto out;
690
691         age = jiffies - rth->dst.lastuse;
692         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
693             (age <= tmo2 && rt_valuable(rth)))
694                 goto out;
695         ret = 1;
696 out:    return ret;
697 }
698
699 /* Bits of score are:
700  * 31: very valuable
701  * 30: not quite useless
702  * 29..0: usage counter
703  */
704 static inline u32 rt_score(struct rtable *rt)
705 {
706         u32 score = jiffies - rt->dst.lastuse;
707
708         score = ~score & ~(3<<30);
709
710         if (rt_valuable(rt))
711                 score |= (1<<31);
712
713         if (rt_is_output_route(rt) ||
714             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
715                 score |= (1<<30);
716
717         return score;
718 }
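/* Illustrative reading (not in the original source): for an unreferenced entry
 * last used "age" jiffies ago, the low 30 bits hold ~age, so more recently used
 * entries score higher; bit 30 marks output / unicast-forward routes and bit 31
 * marks "valuable" ones.  rt_intern_hash() below evicts the lowest-scoring
 * candidate when a chain grows past ip_rt_gc_elasticity.
 */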
719
720 static inline bool rt_caching(const struct net *net)
721 {
722         return net->ipv4.current_rt_cache_rebuild_count <=
723                 net->ipv4.sysctl_rt_cache_rebuild_count;
724 }
725
726 static inline bool compare_hash_inputs(const struct rtable *rt1,
727                                        const struct rtable *rt2)
728 {
729         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
731                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
732 }
733
734 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
735 {
736         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
737                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
738                 (rt1->rt_mark ^ rt2->rt_mark) |
739                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
740                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
741                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
742 }
743
744 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
745 {
746         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
747 }
748
749 static inline int rt_is_expired(struct rtable *rth)
750 {
751         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
752 }
753
754 /*
755  * Perform a full scan of the hash table and free all entries.
756  * Can be called by a softirq or a process.
757  * In the latter case, we want to reschedule if necessary.
758  */
759 static void rt_do_flush(struct net *net, int process_context)
760 {
761         unsigned int i;
762         struct rtable *rth, *next;
763
764         for (i = 0; i <= rt_hash_mask; i++) {
765                 struct rtable __rcu **pprev;
766                 struct rtable *list;
767
768                 if (process_context && need_resched())
769                         cond_resched();
770                 rth = rcu_access_pointer(rt_hash_table[i].chain);
771                 if (!rth)
772                         continue;
773
774                 spin_lock_bh(rt_hash_lock_addr(i));
775
776                 list = NULL;
777                 pprev = &rt_hash_table[i].chain;
778                 rth = rcu_dereference_protected(*pprev,
779                         lockdep_is_held(rt_hash_lock_addr(i)));
780
781                 while (rth) {
782                         next = rcu_dereference_protected(rth->dst.rt_next,
783                                 lockdep_is_held(rt_hash_lock_addr(i)));
784
785                         if (!net ||
786                             net_eq(dev_net(rth->dst.dev), net)) {
787                                 rcu_assign_pointer(*pprev, next);
788                                 rcu_assign_pointer(rth->dst.rt_next, list);
789                                 list = rth;
790                         } else {
791                                 pprev = &rth->dst.rt_next;
792                         }
793                         rth = next;
794                 }
795
796                 spin_unlock_bh(rt_hash_lock_addr(i));
797
798                 for (; list; list = next) {
799                         next = rcu_dereference_protected(list->dst.rt_next, 1);
800                         rt_free(list);
801                 }
802         }
803 }
804
805 /*
806  * While freeing expired entries, we compute average chain length
807  * and standard deviation, using fixed-point arithmetic.
808  * This gives an estimate of rt_chain_length_max:
809  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
810  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
811  */
812
813 #define FRACT_BITS 3
814 #define ONE (1UL << FRACT_BITS)
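/* Worked example with illustrative numbers: has_noalias() below returns either
 * 0 or ONE (1 << 3 = 8), so chain lengths accumulate in units of 1/8.  If
 * rt_check_expire() measures avg = 24 (3.0 entries per chain) and sd = 8 (1.0),
 * then (avg + 4*sd) >> FRACT_BITS = 56 >> 3 = 7, and rt_chain_length_max
 * becomes max(ip_rt_gc_elasticity, 7) = 8 with the default elasticity.
 */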
815
816 /*
817  * Given a hash chain and an item in this hash chain,
818  * determine whether a previous entry has the same hash_inputs
819  * (but differs on tos, mark or oif).
820  * Returns 0 if an alias is found.
821  * Returns ONE if rth has no alias before itself.
822  */
823 static int has_noalias(const struct rtable *head, const struct rtable *rth)
824 {
825         const struct rtable *aux = head;
826
827         while (aux != rth) {
828                 if (compare_hash_inputs(aux, rth))
829                         return 0;
830                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
831         }
832         return ONE;
833 }
834
835 static void rt_check_expire(void)
836 {
837         static unsigned int rover;
838         unsigned int i = rover, goal;
839         struct rtable *rth;
840         struct rtable __rcu **rthp;
841         unsigned long samples = 0;
842         unsigned long sum = 0, sum2 = 0;
843         unsigned long delta;
844         u64 mult;
845
846         delta = jiffies - expires_ljiffies;
847         expires_ljiffies = jiffies;
848         mult = ((u64)delta) << rt_hash_log;
849         if (ip_rt_gc_timeout > 1)
850                 do_div(mult, ip_rt_gc_timeout);
851         goal = (unsigned int)mult;
852         if (goal > rt_hash_mask)
853                 goal = rt_hash_mask + 1;
854         for (; goal > 0; goal--) {
855                 unsigned long tmo = ip_rt_gc_timeout;
856                 unsigned long length;
857
858                 i = (i + 1) & rt_hash_mask;
859                 rthp = &rt_hash_table[i].chain;
860
861                 if (need_resched())
862                         cond_resched();
863
864                 samples++;
865
866                 if (rcu_dereference_raw(*rthp) == NULL)
867                         continue;
868                 length = 0;
869                 spin_lock_bh(rt_hash_lock_addr(i));
870                 while ((rth = rcu_dereference_protected(*rthp,
871                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
872                         prefetch(rth->dst.rt_next);
873                         if (rt_is_expired(rth)) {
874                                 *rthp = rth->dst.rt_next;
875                                 rt_free(rth);
876                                 continue;
877                         }
878                         if (rth->dst.expires) {
879                                 /* Entry is expired even if it is in use */
880                                 if (time_before_eq(jiffies, rth->dst.expires)) {
881 nofree:
882                                         tmo >>= 1;
883                                         rthp = &rth->dst.rt_next;
884                                         /*
885                                          * We only count entries on
886                                          * a chain with equal hash inputs once
887                                          * so that entries for different QOS
888                                          * levels, and other non-hash input
889                                          * attributes don't unfairly skew
890                                          * the length computation
891                                          */
892                                         length += has_noalias(rt_hash_table[i].chain, rth);
893                                         continue;
894                                 }
895                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
896                                 goto nofree;
897
898                         /* Cleanup aged off entries. */
899                         *rthp = rth->dst.rt_next;
900                         rt_free(rth);
901                 }
902                 spin_unlock_bh(rt_hash_lock_addr(i));
903                 sum += length;
904                 sum2 += length*length;
905         }
906         if (samples) {
907                 unsigned long avg = sum / samples;
908                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
909                 rt_chain_length_max = max_t(unsigned long,
910                                         ip_rt_gc_elasticity,
911                                         (avg + 4*sd) >> FRACT_BITS);
912         }
913         rover = i;
914 }
915
916 /*
917  * rt_worker_func() is run in process context.
918  * we call rt_check_expire() to scan part of the hash table
919  */
920 static void rt_worker_func(struct work_struct *work)
921 {
922         rt_check_expire();
923         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
924 }
925
926 /*
927  * Perturbation of rt_genid by a small quantity [1..256].
928  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
929  * many times (2^24) without repeating a recent rt_genid.
930  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
931  */
932 static void rt_cache_invalidate(struct net *net)
933 {
934         unsigned char shuffle;
935
936         get_random_bytes(&shuffle, sizeof(shuffle));
937         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
938         inetpeer_invalidate_tree(net->ipv4.peers);
939 }
940
941 /*
942  * delay < 0  : invalidate cache (fast : entries will be deleted later)
943  * delay >= 0 : invalidate & flush cache (can be long)
944  */
945 void rt_cache_flush(struct net *net, int delay)
946 {
947         rt_cache_invalidate(net);
948         if (delay >= 0)
949                 rt_do_flush(net, !in_softirq());
950 }
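/* Usage note (derived from the code above): rt_cache_flush(net, -1) only
 * invalidates (bumps rt_genid via rt_cache_invalidate()), so stale entries are
 * skipped through rt_is_expired() and reaped lazily, while rt_cache_flush(net, 0)
 * additionally walks every hash bucket right away through rt_do_flush().
 */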
951
952 /* Flush previous cache invalidated entries from the cache */
953 void rt_cache_flush_batch(struct net *net)
954 {
955         rt_do_flush(net, !in_softirq());
956 }
957
958 static void rt_emergency_hash_rebuild(struct net *net)
959 {
960         net_warn_ratelimited("Route hash chain too long!\n");
961         rt_cache_invalidate(net);
962 }
963
964 /*
965    Short description of GC goals.
966
967    We want to build an algorithm that keeps the routing cache
968    at some equilibrium point, where the number of aged-off entries
969    stays approximately equal to the number of newly generated ones.
970
971    The current expiration strength is the variable "expire".
972    We try to adjust it dynamically, so that when the network
973    is idle expire is large enough to keep plenty of warm entries,
974    and when load increases it shrinks to limit the cache size.
975  */
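/* Illustrative reading of the goal computation in rt_garbage_collect() below:
 * goal = entries - (ip_rt_gc_elasticity << rt_hash_log), i.e. with the default
 * elasticity of 8 the collector starts expiring entries once the cache holds
 * more than about 8 entries per hash bucket on average; a positive goal is the
 * number of entries it tries to free in this pass.
 */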
976
977 static int rt_garbage_collect(struct dst_ops *ops)
978 {
979         static unsigned long expire = RT_GC_TIMEOUT;
980         static unsigned long last_gc;
981         static int rover;
982         static int equilibrium;
983         struct rtable *rth;
984         struct rtable __rcu **rthp;
985         unsigned long now = jiffies;
986         int goal;
987         int entries = dst_entries_get_fast(&ipv4_dst_ops);
988
989         /*
990          * Garbage collection is pretty expensive,
991          * so do not run it too frequently.
992          */
993
994         RT_CACHE_STAT_INC(gc_total);
995
996         if (now - last_gc < ip_rt_gc_min_interval &&
997             entries < ip_rt_max_size) {
998                 RT_CACHE_STAT_INC(gc_ignored);
999                 goto out;
1000         }
1001
1002         entries = dst_entries_get_slow(&ipv4_dst_ops);
1003         /* Calculate number of entries, which we want to expire now. */
1004         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1005         if (goal <= 0) {
1006                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1007                         equilibrium = ipv4_dst_ops.gc_thresh;
1008                 goal = entries - equilibrium;
1009                 if (goal > 0) {
1010                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1011                         goal = entries - equilibrium;
1012                 }
1013         } else {
1014                 /* We are in a dangerous area. Try to reduce the cache really
1015                  * aggressively.
1016                  */
1017                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1018                 equilibrium = entries - goal;
1019         }
1020
1021         if (now - last_gc >= ip_rt_gc_min_interval)
1022                 last_gc = now;
1023
1024         if (goal <= 0) {
1025                 equilibrium += goal;
1026                 goto work_done;
1027         }
1028
1029         do {
1030                 int i, k;
1031
1032                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1033                         unsigned long tmo = expire;
1034
1035                         k = (k + 1) & rt_hash_mask;
1036                         rthp = &rt_hash_table[k].chain;
1037                         spin_lock_bh(rt_hash_lock_addr(k));
1038                         while ((rth = rcu_dereference_protected(*rthp,
1039                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1040                                 if (!rt_is_expired(rth) &&
1041                                         !rt_may_expire(rth, tmo, expire)) {
1042                                         tmo >>= 1;
1043                                         rthp = &rth->dst.rt_next;
1044                                         continue;
1045                                 }
1046                                 *rthp = rth->dst.rt_next;
1047                                 rt_free(rth);
1048                                 goal--;
1049                         }
1050                         spin_unlock_bh(rt_hash_lock_addr(k));
1051                         if (goal <= 0)
1052                                 break;
1053                 }
1054                 rover = k;
1055
1056                 if (goal <= 0)
1057                         goto work_done;
1058
1059                 /* Goal is not achieved. We stop the process if:
1060
1061                    - expire has been reduced to zero (otherwise expire is halved).
1062                    - the table is not full.
1063                    - we are called from interrupt context.
1064                    - the jiffies check is just a fallback/debug loop breaker.
1065                      We will not spin here for a long time in any case.
1066                  */
1067
1068                 RT_CACHE_STAT_INC(gc_goal_miss);
1069
1070                 if (expire == 0)
1071                         break;
1072
1073                 expire >>= 1;
1074
1075                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1076                         goto out;
1077         } while (!in_softirq() && time_before_eq(jiffies, now));
1078
1079         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080                 goto out;
1081         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1082                 goto out;
1083         net_warn_ratelimited("dst cache overflow\n");
1084         RT_CACHE_STAT_INC(gc_dst_overflow);
1085         return 1;
1086
1087 work_done:
1088         expire += ip_rt_gc_min_interval;
1089         if (expire > ip_rt_gc_timeout ||
1090             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1091             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1092                 expire = ip_rt_gc_timeout;
1093 out:    return 0;
1094 }
1095
1096 /*
1097  * Returns the number of entries in a hash chain that have distinct hash_inputs.
1098  */
1099 static int slow_chain_length(const struct rtable *head)
1100 {
1101         int length = 0;
1102         const struct rtable *rth = head;
1103
1104         while (rth) {
1105                 length += has_noalias(head, rth);
1106                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1107         }
1108         return length >> FRACT_BITS;
1109 }
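/* Note (derived from the helpers above): has_noalias() contributes ONE
 * (1 << FRACT_BITS) per entry that has no earlier alias, so the final
 * ">> FRACT_BITS" turns the fixed-point sum back into a plain count of
 * distinct hash inputs.
 */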
1110
1111 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1112 {
1113         static const __be32 inaddr_any = 0;
1114         struct net_device *dev = dst->dev;
1115         const __be32 *pkey = daddr;
1116         const struct rtable *rt;
1117         struct neighbour *n;
1118
1119         rt = (const struct rtable *) dst;
1120
1121         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122                 pkey = &inaddr_any;
1123         else if (rt->rt_gateway)
1124                 pkey = (const __be32 *) &rt->rt_gateway;
1125
1126         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1127         if (n)
1128                 return n;
1129         return neigh_create(&arp_tbl, pkey, dev);
1130 }
1131
1132 static int rt_bind_neighbour(struct rtable *rt)
1133 {
1134         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1135         if (IS_ERR(n))
1136                 return PTR_ERR(n);
1137         dst_set_neighbour(&rt->dst, n);
1138
1139         return 0;
1140 }
1141
1142 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1143                                      struct sk_buff *skb, int ifindex)
1144 {
1145         struct rtable   *rth, *cand;
1146         struct rtable __rcu **rthp, **candp;
1147         unsigned long   now;
1148         u32             min_score;
1149         int             chain_length;
1150         int attempts = !in_softirq();
1151
1152 restart:
1153         chain_length = 0;
1154         min_score = ~(u32)0;
1155         cand = NULL;
1156         candp = NULL;
1157         now = jiffies;
1158
1159         if (!rt_caching(dev_net(rt->dst.dev))) {
1160                 /*
1161                  * If we're not caching, just tell the caller we
1162                  * were successful and don't touch the route.  The
1163                  * caller holds the sole reference to the cache entry, and
1164                  * it will be released when the caller is done with it.
1165                  * If we drop it here, the callers have no way to resolve routes
1166                  * when we're not caching.  Instead, just point *rp at rt, so
1167                  * the caller gets a single use out of the route.
1168                  * Note that we do rt_free() on this new route entry, so that
1169                  * once its refcount hits zero, we are still able to reap it
1170                  * (Thanks Alexey).
1171                  * Note: To avoid expensive rcu stuff for this uncached dst,
1172                  * we set DST_NOCACHE so that dst_release() can free dst without
1173                  * waiting for a grace period.
1174                  */
1175
1176                 rt->dst.flags |= DST_NOCACHE;
1177                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                         int err = rt_bind_neighbour(rt);
1179                         if (err) {
1180                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1181                                 ip_rt_put(rt);
1182                                 return ERR_PTR(err);
1183                         }
1184                 }
1185
1186                 goto skip_hashing;
1187         }
1188
1189         rthp = &rt_hash_table[hash].chain;
1190
1191         spin_lock_bh(rt_hash_lock_addr(hash));
1192         while ((rth = rcu_dereference_protected(*rthp,
1193                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1194                 if (rt_is_expired(rth)) {
1195                         *rthp = rth->dst.rt_next;
1196                         rt_free(rth);
1197                         continue;
1198                 }
1199                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1200                         /* Put it first */
1201                         *rthp = rth->dst.rt_next;
1202                         /*
1203                          * Since lookup is lockfree, the deletion
1204                          * must be visible to another weakly ordered CPU before
1205                          * the insertion at the start of the hash chain.
1206                          */
1207                         rcu_assign_pointer(rth->dst.rt_next,
1208                                            rt_hash_table[hash].chain);
1209                         /*
1210                          * Since lookup is lockfree, the update writes
1211                          * must be ordered for consistency on SMP.
1212                          */
1213                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1214
1215                         dst_use(&rth->dst, now);
1216                         spin_unlock_bh(rt_hash_lock_addr(hash));
1217
1218                         rt_drop(rt);
1219                         if (skb)
1220                                 skb_dst_set(skb, &rth->dst);
1221                         return rth;
1222                 }
1223
1224                 if (!atomic_read(&rth->dst.__refcnt)) {
1225                         u32 score = rt_score(rth);
1226
1227                         if (score <= min_score) {
1228                                 cand = rth;
1229                                 candp = rthp;
1230                                 min_score = score;
1231                         }
1232                 }
1233
1234                 chain_length++;
1235
1236                 rthp = &rth->dst.rt_next;
1237         }
1238
1239         if (cand) {
1240                 /* ip_rt_gc_elasticity used to be the average chain length;
1241                  * when exceeded, gc becomes really aggressive.
1242                  *
1243                  * The second limit is less certain. At the moment it allows
1244                  * only 2 entries per bucket. We will see.
1245                  */
1246                 if (chain_length > ip_rt_gc_elasticity) {
1247                         *candp = cand->dst.rt_next;
1248                         rt_free(cand);
1249                 }
1250         } else {
1251                 if (chain_length > rt_chain_length_max &&
1252                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1253                         struct net *net = dev_net(rt->dst.dev);
1254                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1255                         if (!rt_caching(net)) {
1256                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1257                                         rt->dst.dev->name, num);
1258                         }
1259                         rt_emergency_hash_rebuild(net);
1260                         spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1263                                         ifindex, rt_genid(net));
1264                         goto restart;
1265                 }
1266         }
1267
1268         /* Try to bind the route to arp only if it is an output
1269            route or on the unicast forwarding path.
1270          */
1271         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1272                 int err = rt_bind_neighbour(rt);
1273                 if (err) {
1274                         spin_unlock_bh(rt_hash_lock_addr(hash));
1275
1276                         if (err != -ENOBUFS) {
1277                                 rt_drop(rt);
1278                                 return ERR_PTR(err);
1279                         }
1280
1281                         /* Neighbour tables are full and nothing
1282                            can be released. Try to shrink the route cache;
1283                            it most likely holds some neighbour records.
1284                          */
1285                         if (attempts-- > 0) {
1286                                 int saved_elasticity = ip_rt_gc_elasticity;
1287                                 int saved_int = ip_rt_gc_min_interval;
1288                                 ip_rt_gc_elasticity     = 1;
1289                                 ip_rt_gc_min_interval   = 0;
1290                                 rt_garbage_collect(&ipv4_dst_ops);
1291                                 ip_rt_gc_min_interval   = saved_int;
1292                                 ip_rt_gc_elasticity     = saved_elasticity;
1293                                 goto restart;
1294                         }
1295
1296                         net_warn_ratelimited("Neighbour table overflow\n");
1297                         rt_drop(rt);
1298                         return ERR_PTR(-ENOBUFS);
1299                 }
1300         }
1301
1302         rt->dst.rt_next = rt_hash_table[hash].chain;
1303
1304         /*
1305          * Since lookup is lockfree, we must make sure
1306          * previous writes to rt are committed to memory
1307          * before making rt visible to other CPUs.
1308          */
1309         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1310
1311         spin_unlock_bh(rt_hash_lock_addr(hash));
1312
1313 skip_hashing:
1314         if (skb)
1315                 skb_dst_set(skb, &rt->dst);
1316         return rt;
1317 }
1318
1319 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1320
1321 static u32 rt_peer_genid(void)
1322 {
1323         return atomic_read(&__rt_peer_genid);
1324 }
1325
1326 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1327 {
1328         struct net *net = dev_net(rt->dst.dev);
1329         struct inet_peer *peer;
1330
1331         peer = inet_getpeer_v4(net, daddr, create);
1332
1333         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1334                 inet_putpeer(peer);
1335         else
1336                 rt->rt_peer_genid = rt_peer_genid();
1337 }
1338
1339 /*
1340  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1341  * we can still generate some output.
1342  * Random ID selection looks a bit dangerous because we have no chance of
1343  * selecting an ID that is unique within a reasonable period of time.
1344  * But a broken packet identifier may be better than no packet at all.
1345  */
1346 static void ip_select_fb_ident(struct iphdr *iph)
1347 {
1348         static DEFINE_SPINLOCK(ip_fb_id_lock);
1349         static u32 ip_fallback_id;
1350         u32 salt;
1351
1352         spin_lock_bh(&ip_fb_id_lock);
1353         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1354         iph->id = htons(salt & 0xFFFF);
1355         ip_fallback_id = salt;
1356         spin_unlock_bh(&ip_fb_id_lock);
1357 }
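/* Descriptive note (a reading of the code above): the fallback path folds the
 * previous fallback value and the destination address through secure_ip_id()
 * under ip_fb_id_lock; only the low 16 bits become iph->id, so successive IDs
 * for one destination are not simply sequential.
 */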
1358
1359 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1360 {
1361         struct rtable *rt = (struct rtable *) dst;
1362
1363         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1364                 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1365
1366                 /* If a peer is attached to the destination, it is never detached,
1367                    so we need not grab a lock to dereference it.
1368                  */
1369                 if (peer) {
1370                         iph->id = htons(inet_getid(peer, more));
1371                         return;
1372                 }
1373         } else if (!rt)
1374                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1375
1376         ip_select_fb_ident(iph);
1377 }
1378 EXPORT_SYMBOL(__ip_select_ident);
1379
1380 static void rt_del(unsigned int hash, struct rtable *rt)
1381 {
1382         struct rtable __rcu **rthp;
1383         struct rtable *aux;
1384
1385         rthp = &rt_hash_table[hash].chain;
1386         spin_lock_bh(rt_hash_lock_addr(hash));
1387         ip_rt_put(rt);
1388         while ((aux = rcu_dereference_protected(*rthp,
1389                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1390                 if (aux == rt || rt_is_expired(aux)) {
1391                         *rthp = aux->dst.rt_next;
1392                         rt_free(aux);
1393                         continue;
1394                 }
1395                 rthp = &aux->dst.rt_next;
1396         }
1397         spin_unlock_bh(rt_hash_lock_addr(hash));
1398 }
1399
1400 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1401 {
1402         struct rtable *rt = (struct rtable *) dst;
1403         __be32 orig_gw = rt->rt_gateway;
1404         struct neighbour *n, *old_n;
1405
1406         dst_confirm(&rt->dst);
1407
1408         rt->rt_gateway = peer->redirect_learned.a4;
1409
1410         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1411         if (IS_ERR(n)) {
1412                 rt->rt_gateway = orig_gw;
1413                 return;
1414         }
1415         old_n = xchg(&rt->dst._neighbour, n);
1416         if (old_n)
1417                 neigh_release(old_n);
1418         if (!(n->nud_state & NUD_VALID)) {
1419                 neigh_event_send(n, NULL);
1420         } else {
1421                 rt->rt_flags |= RTCF_REDIRECTED;
1422                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1423         }
1424 }
1425
1426 /* called in rcu_read_lock() section */
1427 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1428                     __be32 saddr, struct net_device *dev)
1429 {
1430         int s, i;
1431         struct in_device *in_dev = __in_dev_get_rcu(dev);
1432         __be32 skeys[2] = { saddr, 0 };
1433         int    ikeys[2] = { dev->ifindex, 0 };
1434         struct inet_peer *peer;
1435         struct net *net;
1436
1437         if (!in_dev)
1438                 return;
1439
1440         net = dev_net(dev);
1441         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1442             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1443             ipv4_is_zeronet(new_gw))
1444                 goto reject_redirect;
1445
1446         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1447                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1448                         goto reject_redirect;
1449                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1450                         goto reject_redirect;
1451         } else {
1452                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1453                         goto reject_redirect;
1454         }
1455
1456         for (s = 0; s < 2; s++) {
1457                 for (i = 0; i < 2; i++) {
1458                         unsigned int hash;
1459                         struct rtable __rcu **rthp;
1460                         struct rtable *rt;
1461
1462                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1463
1464                         rthp = &rt_hash_table[hash].chain;
1465
1466                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1467                                 rthp = &rt->dst.rt_next;
1468
1469                                 if (rt->rt_key_dst != daddr ||
1470                                     rt->rt_key_src != skeys[s] ||
1471                                     rt->rt_oif != ikeys[i] ||
1472                                     rt_is_input_route(rt) ||
1473                                     rt_is_expired(rt) ||
1474                                     !net_eq(dev_net(rt->dst.dev), net) ||
1475                                     rt->dst.error ||
1476                                     rt->dst.dev != dev ||
1477                                     rt->rt_gateway != old_gw)
1478                                         continue;
1479
1480                                 peer = rt_get_peer_create(rt, rt->rt_dst);
1481                                 if (peer) {
1482                                         if (peer->redirect_learned.a4 != new_gw) {
1483                                                 peer->redirect_learned.a4 = new_gw;
1484                                                 atomic_inc(&__rt_peer_genid);
1485                                         }
1486                                         check_peer_redir(&rt->dst, peer);
1487                                 }
1488                         }
1489                 }
1490         }
1491         return;
1492
1493 reject_redirect:
1494 #ifdef CONFIG_IP_ROUTE_VERBOSE
1495         if (IN_DEV_LOG_MARTIANS(in_dev))
1496                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1497                                      "  Advised path = %pI4 -> %pI4\n",
1498                                      &old_gw, dev->name, &new_gw,
1499                                      &saddr, &daddr);
1500 #endif
1501         ;
1502 }
1503
1504 static bool peer_pmtu_expired(struct inet_peer *peer)
1505 {
1506         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1507
1508         return orig &&
1509                time_after_eq(jiffies, orig) &&
1510                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1511 }
1512
1513 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1514 {
1515         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1516
1517         return orig &&
1518                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1519 }
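
/* A note on the pattern shared by peer_pmtu_expired() and peer_pmtu_cleaned()
 * above: cmpxchg() guarantees that when several CPUs race to retire a learned
 * PMTU, exactly one of them sees the old non-zero pmtu_expires value and wins
 * the swap to 0, so only that winner restores peer->pmtu_orig.  A minimal
 * sketch of the idiom, using a hypothetical shared word "expires":
 *
 *	unsigned long old = ACCESS_ONCE(expires);
 *	if (old && cmpxchg(&expires, old, 0) == old)
 *		;	/* this CPU alone performs the one-time cleanup */
 *
 * peer_pmtu_expired() additionally requires that the deadline has passed.
 */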
1520
1521 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1522 {
1523         struct rtable *rt = (struct rtable *)dst;
1524         struct dst_entry *ret = dst;
1525
1526         if (rt) {
1527                 if (dst->obsolete > 0) {
1528                         ip_rt_put(rt);
1529                         ret = NULL;
1530                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1531                         unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1532                                                 rt->rt_oif,
1533                                                 rt_genid(dev_net(dst->dev)));
1534                         rt_del(hash, rt);
1535                         ret = NULL;
1536                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1537                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1538                 }
1539         }
1540         return ret;
1541 }
1542
1543 /*
1544  * Algorithm:
1545  *      1. The first ip_rt_redirect_number redirects are sent
1546  *         with exponential backoff, then we stop sending them altogether,
1547  *         assuming that the host ignores our redirects.
1548  *      2. If we did not see packets requiring redirects
1549  *         during ip_rt_redirect_silence, we assume that the host
1550  *         forgot the redirected route and start sending redirects again.
1551  *
1552  * This algorithm is much cheaper and more intelligent than dumb load limiting
1553  * in icmp.c.
1554  *
1555  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1556  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1557  */
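
/* A worked example of the schedule above, assuming the usual sysctl defaults
 * (ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50 i.e. ~20 ms, and
 * ip_rt_redirect_silence = (HZ/50) << 10 i.e. ~20 s): each redirect is only
 * sent once jiffies has passed rate_last + (ip_rt_redirect_load << rate_tokens),
 * so the gaps grow roughly 20 ms, 40 ms, 80 ms, ... up to ~5.1 s.  After the
 * ninth redirect nothing more is sent until the destination stays quiet for
 * about 20 s, at which point rate_tokens is reset and the cycle restarts.
 */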
1558
1559 void ip_rt_send_redirect(struct sk_buff *skb)
1560 {
1561         struct rtable *rt = skb_rtable(skb);
1562         struct in_device *in_dev;
1563         struct inet_peer *peer;
1564         int log_martians;
1565
1566         rcu_read_lock();
1567         in_dev = __in_dev_get_rcu(rt->dst.dev);
1568         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1569                 rcu_read_unlock();
1570                 return;
1571         }
1572         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1573         rcu_read_unlock();
1574
1575         peer = rt_get_peer_create(rt, rt->rt_dst);
1576         if (!peer) {
1577                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1578                 return;
1579         }
1580
1581         /* No redirected packets during ip_rt_redirect_silence;
1582          * reset the algorithm.
1583          */
1584         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1585                 peer->rate_tokens = 0;
1586
1587         /* Too many ignored redirects; do not send anything.
1588          * Set dst.rate_last to the last seen redirected packet.
1589          */
1590         if (peer->rate_tokens >= ip_rt_redirect_number) {
1591                 peer->rate_last = jiffies;
1592                 return;
1593         }
1594
1595         /* Check for load limit; set rate_last to the latest sent
1596          * redirect.
1597          */
1598         if (peer->rate_tokens == 0 ||
1599             time_after(jiffies,
1600                        (peer->rate_last +
1601                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1602                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1603                 peer->rate_last = jiffies;
1604                 ++peer->rate_tokens;
1605 #ifdef CONFIG_IP_ROUTE_VERBOSE
1606                 if (log_martians &&
1607                     peer->rate_tokens == ip_rt_redirect_number)
1608                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1609                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1610                                              &rt->rt_dst, &rt->rt_gateway);
1611 #endif
1612         }
1613 }
1614
1615 static int ip_error(struct sk_buff *skb)
1616 {
1617         struct rtable *rt = skb_rtable(skb);
1618         struct inet_peer *peer;
1619         unsigned long now;
1620         bool send;
1621         int code;
1622
1623         switch (rt->dst.error) {
1624         case EINVAL:
1625         default:
1626                 goto out;
1627         case EHOSTUNREACH:
1628                 code = ICMP_HOST_UNREACH;
1629                 break;
1630         case ENETUNREACH:
1631                 code = ICMP_NET_UNREACH;
1632                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1633                                 IPSTATS_MIB_INNOROUTES);
1634                 break;
1635         case EACCES:
1636                 code = ICMP_PKT_FILTERED;
1637                 break;
1638         }
1639
1640         peer = rt_get_peer_create(rt, rt->rt_dst);
1641
1642         send = true;
1643         if (peer) {
1644                 now = jiffies;
1645                 peer->rate_tokens += now - peer->rate_last;
1646                 if (peer->rate_tokens > ip_rt_error_burst)
1647                         peer->rate_tokens = ip_rt_error_burst;
1648                 peer->rate_last = now;
1649                 if (peer->rate_tokens >= ip_rt_error_cost)
1650                         peer->rate_tokens -= ip_rt_error_cost;
1651                 else
1652                         send = false;
1653         }
1654         if (send)
1655                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1656
1657 out:    kfree_skb(skb);
1658         return 0;
1659 }
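
/* The peer block in ip_error() is a small token bucket; a rough sketch of its
 * effect, assuming the usual defaults (ip_rt_error_cost = HZ and
 * ip_rt_error_burst = 5 * HZ):
 *
 *	tokens += jiffies_since_last_error;	// earn one token per jiffy
 *	tokens  = min(tokens, 5 * HZ);		// bucket holds at most 5 seconds
 *	if (tokens >= HZ) {			// each ICMP costs one second
 *		tokens -= HZ;
 *		send ICMP_DEST_UNREACH;
 *	}
 *
 * i.e. roughly one unreachable message per second per peer, with a burst of
 * up to five after a quiet period.
 */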
1660
1661 /*
1662  *      The last two values are not from the RFC but
1663  *      are needed for AMPRnet AX.25 paths.
1664  */
1665
1666 static const unsigned short mtu_plateau[] =
1667 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1668
1669 static inline unsigned short guess_mtu(unsigned short old_mtu)
1670 {
1671         int i;
1672
1673         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1674                 if (old_mtu > mtu_plateau[i])
1675                         return mtu_plateau[i];
1676         return 68;
1677 }
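
/* For example, with the table above: an old_mtu of 1500 maps to the 1492
 * plateau, 600 maps to 576, and anything at or below 128 falls back to the
 * minimum of 68.
 */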
1678
1679 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1680                                  unsigned short new_mtu,
1681                                  struct net_device *dev)
1682 {
1683         unsigned short old_mtu = ntohs(iph->tot_len);
1684         unsigned short est_mtu = 0;
1685         struct inet_peer *peer;
1686
1687         peer = inet_getpeer_v4(net, iph->daddr, 1);
1688         if (peer) {
1689                 unsigned short mtu = new_mtu;
1690
1691                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1692                         /* BSD 4.2 derived systems incorrectly adjust
1693                          * tot_len by the IP header length, and report
1694                          * a zero MTU in the ICMP message.
1695                          */
1696                         if (mtu == 0 &&
1697                             old_mtu >= 68 + (iph->ihl << 2))
1698                                 old_mtu -= iph->ihl << 2;
1699                         mtu = guess_mtu(old_mtu);
1700                 }
1701
1702                 if (mtu < ip_rt_min_pmtu)
1703                         mtu = ip_rt_min_pmtu;
1704                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1705                         unsigned long pmtu_expires;
1706
1707                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1708                         if (!pmtu_expires)
1709                                 pmtu_expires = 1UL;
1710
1711                         est_mtu = mtu;
1712                         peer->pmtu_learned = mtu;
1713                         peer->pmtu_expires = pmtu_expires;
1714                         atomic_inc(&__rt_peer_genid);
1715                 }
1716
1717                 inet_putpeer(peer);
1718         }
1719         return est_mtu ? : new_mtu;
1720 }
1721
1722 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1723 {
1724         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1725
1726         if (!expires)
1727                 return;
1728         if (time_before(jiffies, expires)) {
1729                 u32 orig_dst_mtu = dst_mtu(dst);
1730                 if (peer->pmtu_learned < orig_dst_mtu) {
1731                         if (!peer->pmtu_orig)
1732                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1733                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1734                 }
1735         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1736                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1737 }
1738
1739 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1740 {
1741         struct rtable *rt = (struct rtable *) dst;
1742         struct inet_peer *peer;
1743
1744         dst_confirm(dst);
1745
1746         peer = rt_get_peer_create(rt, rt->rt_dst);
1747         if (peer) {
1748                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1749
1750                 if (mtu < ip_rt_min_pmtu)
1751                         mtu = ip_rt_min_pmtu;
1752                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1753
1754                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1755                         if (!pmtu_expires)
1756                                 pmtu_expires = 1UL;
1757
1758                         peer->pmtu_learned = mtu;
1759                         peer->pmtu_expires = pmtu_expires;
1760
1761                         atomic_inc(&__rt_peer_genid);
1762                         rt->rt_peer_genid = rt_peer_genid();
1763                 }
1764                 check_peer_pmtu(dst, peer);
1765         }
1766 }
1767
1768
1769 static void ipv4_validate_peer(struct rtable *rt)
1770 {
1771         if (rt->rt_peer_genid != rt_peer_genid()) {
1772                 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
1773
1774                 if (peer) {
1775                         check_peer_pmtu(&rt->dst, peer);
1776
1777                         if (peer->redirect_learned.a4 &&
1778                             peer->redirect_learned.a4 != rt->rt_gateway)
1779                                 check_peer_redir(&rt->dst, peer);
1780                 }
1781
1782                 rt->rt_peer_genid = rt_peer_genid();
1783         }
1784 }
1785
1786 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1787 {
1788         struct rtable *rt = (struct rtable *) dst;
1789
1790         if (rt_is_expired(rt))
1791                 return NULL;
1792         ipv4_validate_peer(rt);
1793         return dst;
1794 }
1795
1796 static void ipv4_dst_destroy(struct dst_entry *dst)
1797 {
1798         struct rtable *rt = (struct rtable *) dst;
1799         struct inet_peer *peer = rt->peer;
1800
1801         if (rt->fi) {
1802                 fib_info_put(rt->fi);
1803                 rt->fi = NULL;
1804         }
1805         if (peer) {
1806                 rt->peer = NULL;
1807                 inet_putpeer(peer);
1808         }
1809 }
1810
1811
1812 static void ipv4_link_failure(struct sk_buff *skb)
1813 {
1814         struct rtable *rt;
1815
1816         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1817
1818         rt = skb_rtable(skb);
1819         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1820                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1821 }
1822
1823 static int ip_rt_bug(struct sk_buff *skb)
1824 {
1825         pr_debug("%s: %pI4 -> %pI4, %s\n",
1826                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1827                  skb->dev ? skb->dev->name : "?");
1828         kfree_skb(skb);
1829         WARN_ON(1);
1830         return 0;
1831 }
1832
1833 /*
1834    We do not cache the source address of the outgoing interface,
1835    because it is used only by the IP RR, TS and SRR options,
1836    so it is out of the fast path.
1837
1838    BTW remember: "addr" is allowed to be unaligned
1839    in IP options!
1840  */
1841
1842 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1843 {
1844         __be32 src;
1845
1846         if (rt_is_output_route(rt))
1847                 src = ip_hdr(skb)->saddr;
1848         else {
1849                 struct fib_result res;
1850                 struct flowi4 fl4;
1851                 struct iphdr *iph;
1852
1853                 iph = ip_hdr(skb);
1854
1855                 memset(&fl4, 0, sizeof(fl4));
1856                 fl4.daddr = iph->daddr;
1857                 fl4.saddr = iph->saddr;
1858                 fl4.flowi4_tos = RT_TOS(iph->tos);
1859                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1860                 fl4.flowi4_iif = skb->dev->ifindex;
1861                 fl4.flowi4_mark = skb->mark;
1862
1863                 rcu_read_lock();
1864                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1865                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1866                 else
1867                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1868                                         RT_SCOPE_UNIVERSE);
1869                 rcu_read_unlock();
1870         }
1871         memcpy(addr, &src, 4);
1872 }
1873
1874 #ifdef CONFIG_IP_ROUTE_CLASSID
1875 static void set_class_tag(struct rtable *rt, u32 tag)
1876 {
1877         if (!(rt->dst.tclassid & 0xFFFF))
1878                 rt->dst.tclassid |= tag & 0xFFFF;
1879         if (!(rt->dst.tclassid & 0xFFFF0000))
1880                 rt->dst.tclassid |= tag & 0xFFFF0000;
1881 }
1882 #endif
1883
1884 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1885 {
1886         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1887
1888         if (advmss == 0) {
1889                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1890                                ip_rt_min_advmss);
1891                 if (advmss > 65535 - 40)
1892                         advmss = 65535 - 40;
1893         }
1894         return advmss;
1895 }
1896
1897 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1898 {
1899         const struct rtable *rt = (const struct rtable *) dst;
1900         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1901
1902         if (mtu && rt_is_output_route(rt))
1903                 return mtu;
1904
1905         mtu = dst->dev->mtu;
1906
1907         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1908
1909                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1910                         mtu = 576;
1911         }
1912
1913         if (mtu > IP_MAX_MTU)
1914                 mtu = IP_MAX_MTU;
1915
1916         return mtu;
1917 }
1918
1919 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1920                             struct fib_info *fi)
1921 {
1922         struct net *net = dev_net(rt->dst.dev);
1923         struct inet_peer *peer;
1924         int create = 0;
1925
1926         /* If a peer entry exists for this destination, we must hook
1927          * it up in order to get at cached metrics.
1928          */
1929         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1930                 create = 1;
1931
1932         rt->peer = peer = inet_getpeer_v4(net, rt->rt_dst, create);
1933         if (peer) {
1934                 rt->rt_peer_genid = rt_peer_genid();
1935                 if (inet_metrics_new(peer))
1936                         memcpy(peer->metrics, fi->fib_metrics,
1937                                sizeof(u32) * RTAX_MAX);
1938                 dst_init_metrics(&rt->dst, peer->metrics, false);
1939
1940                 check_peer_pmtu(&rt->dst, peer);
1941
1942                 if (peer->redirect_learned.a4 &&
1943                     peer->redirect_learned.a4 != rt->rt_gateway) {
1944                         rt->rt_gateway = peer->redirect_learned.a4;
1945                         rt->rt_flags |= RTCF_REDIRECTED;
1946                 }
1947         } else {
1948                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1949                         rt->fi = fi;
1950                         atomic_inc(&fi->fib_clntref);
1951                 }
1952                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1953         }
1954 }
1955
1956 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1957                            const struct fib_result *res,
1958                            struct fib_info *fi, u16 type, u32 itag)
1959 {
1960         struct dst_entry *dst = &rt->dst;
1961
1962         if (fi) {
1963                 if (FIB_RES_GW(*res) &&
1964                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1965                         rt->rt_gateway = FIB_RES_GW(*res);
1966                 rt_init_metrics(rt, fl4, fi);
1967 #ifdef CONFIG_IP_ROUTE_CLASSID
1968                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1969 #endif
1970         }
1971
1972         if (dst_mtu(dst) > IP_MAX_MTU)
1973                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1974         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1975                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1976
1977 #ifdef CONFIG_IP_ROUTE_CLASSID
1978 #ifdef CONFIG_IP_MULTIPLE_TABLES
1979         set_class_tag(rt, fib_rules_tclass(res));
1980 #endif
1981         set_class_tag(rt, itag);
1982 #endif
1983 }
1984
1985 static struct rtable *rt_dst_alloc(struct net_device *dev,
1986                                    bool nopolicy, bool noxfrm)
1987 {
1988         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1989                          DST_HOST |
1990                          (nopolicy ? DST_NOPOLICY : 0) |
1991                          (noxfrm ? DST_NOXFRM : 0));
1992 }
1993
1994 /* called in rcu_read_lock() section */
1995 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1996                                 u8 tos, struct net_device *dev, int our)
1997 {
1998         unsigned int hash;
1999         struct rtable *rth;
2000         __be32 spec_dst;
2001         struct in_device *in_dev = __in_dev_get_rcu(dev);
2002         u32 itag = 0;
2003         int err;
2004
2005         /* Primary sanity checks. */
2006
2007         if (in_dev == NULL)
2008                 return -EINVAL;
2009
2010         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2011             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2012                 goto e_inval;
2013
2014         if (ipv4_is_zeronet(saddr)) {
2015                 if (!ipv4_is_local_multicast(daddr))
2016                         goto e_inval;
2017                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2018         } else {
2019                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2020                                           &itag);
2021                 if (err < 0)
2022                         goto e_err;
2023         }
2024         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2025                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2026         if (!rth)
2027                 goto e_nobufs;
2028
2029 #ifdef CONFIG_IP_ROUTE_CLASSID
2030         rth->dst.tclassid = itag;
2031 #endif
2032         rth->dst.output = ip_rt_bug;
2033
2034         rth->rt_key_dst = daddr;
2035         rth->rt_key_src = saddr;
2036         rth->rt_genid   = rt_genid(dev_net(dev));
2037         rth->rt_flags   = RTCF_MULTICAST;
2038         rth->rt_type    = RTN_MULTICAST;
2039         rth->rt_key_tos = tos;
2040         rth->rt_dst     = daddr;
2041         rth->rt_src     = saddr;
2042         rth->rt_route_iif = dev->ifindex;
2043         rth->rt_iif     = dev->ifindex;
2044         rth->rt_oif     = 0;
2045         rth->rt_mark    = skb->mark;
2046         rth->rt_gateway = daddr;
2047         rth->rt_spec_dst= spec_dst;
2048         rth->rt_peer_genid = 0;
2049         rth->peer = NULL;
2050         rth->fi = NULL;
2051         if (our) {
2052                 rth->dst.input= ip_local_deliver;
2053                 rth->rt_flags |= RTCF_LOCAL;
2054         }
2055
2056 #ifdef CONFIG_IP_MROUTE
2057         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2058                 rth->dst.input = ip_mr_input;
2059 #endif
2060         RT_CACHE_STAT_INC(in_slow_mc);
2061
2062         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2063         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2064         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2065
2066 e_nobufs:
2067         return -ENOBUFS;
2068 e_inval:
2069         return -EINVAL;
2070 e_err:
2071         return err;
2072 }
2073
2074
2075 static void ip_handle_martian_source(struct net_device *dev,
2076                                      struct in_device *in_dev,
2077                                      struct sk_buff *skb,
2078                                      __be32 daddr,
2079                                      __be32 saddr)
2080 {
2081         RT_CACHE_STAT_INC(in_martian_src);
2082 #ifdef CONFIG_IP_ROUTE_VERBOSE
2083         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2084                 /*
2085                  *      RFC1812 recommendation: if the source is martian,
2086                  *      the only hint is the MAC header.
2087                  */
2088                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2089                         &daddr, &saddr, dev->name);
2090                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2091                         print_hex_dump(KERN_WARNING, "ll header: ",
2092                                        DUMP_PREFIX_OFFSET, 16, 1,
2093                                        skb_mac_header(skb),
2094                                        dev->hard_header_len, true);
2095                 }
2096         }
2097 #endif
2098 }
2099
2100 /* called in rcu_read_lock() section */
2101 static int __mkroute_input(struct sk_buff *skb,
2102                            const struct fib_result *res,
2103                            struct in_device *in_dev,
2104                            __be32 daddr, __be32 saddr, u32 tos,
2105                            struct rtable **result)
2106 {
2107         struct rtable *rth;
2108         int err;
2109         struct in_device *out_dev;
2110         unsigned int flags = 0;
2111         __be32 spec_dst;
2112         u32 itag;
2113
2114         /* get a working reference to the output device */
2115         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2116         if (out_dev == NULL) {
2117                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2118                 return -EINVAL;
2119         }
2120
2121
2122         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2123                                   in_dev->dev, &spec_dst, &itag);
2124         if (err < 0) {
2125                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2126                                          saddr);
2127
2128                 goto cleanup;
2129         }
2130
2131         if (err)
2132                 flags |= RTCF_DIRECTSRC;
2133
2134         if (out_dev == in_dev && err &&
2135             (IN_DEV_SHARED_MEDIA(out_dev) ||
2136              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2137                 flags |= RTCF_DOREDIRECT;
2138
2139         if (skb->protocol != htons(ETH_P_IP)) {
2140                 /* Not IP (i.e. ARP). Do not create a route if it is
2141                  * invalid for proxy ARP. DNAT routes are always valid.
2142                  *
2143                  * The proxy ARP feature has been extended to allow ARP
2144                  * replies back on the same interface, to support
2145                  * Private VLAN switch technologies. See arp.c.
2146                  */
2147                 if (out_dev == in_dev &&
2148                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2149                         err = -EINVAL;
2150                         goto cleanup;
2151                 }
2152         }
2153
2154         rth = rt_dst_alloc(out_dev->dev,
2155                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2156                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2157         if (!rth) {
2158                 err = -ENOBUFS;
2159                 goto cleanup;
2160         }
2161
2162         rth->rt_key_dst = daddr;
2163         rth->rt_key_src = saddr;
2164         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2165         rth->rt_flags = flags;
2166         rth->rt_type = res->type;
2167         rth->rt_key_tos = tos;
2168         rth->rt_dst     = daddr;
2169         rth->rt_src     = saddr;
2170         rth->rt_route_iif = in_dev->dev->ifindex;
2171         rth->rt_iif     = in_dev->dev->ifindex;
2172         rth->rt_oif     = 0;
2173         rth->rt_mark    = skb->mark;
2174         rth->rt_gateway = daddr;
2175         rth->rt_spec_dst= spec_dst;
2176         rth->rt_peer_genid = 0;
2177         rth->peer = NULL;
2178         rth->fi = NULL;
2179
2180         rth->dst.input = ip_forward;
2181         rth->dst.output = ip_output;
2182
2183         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2184
2185         *result = rth;
2186         err = 0;
2187  cleanup:
2188         return err;
2189 }
2190
2191 static int ip_mkroute_input(struct sk_buff *skb,
2192                             struct fib_result *res,
2193                             const struct flowi4 *fl4,
2194                             struct in_device *in_dev,
2195                             __be32 daddr, __be32 saddr, u32 tos)
2196 {
2197         struct rtable *rth = NULL;
2198         int err;
2199         unsigned int hash;
2200
2201 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2202         if (res->fi && res->fi->fib_nhs > 1)
2203                 fib_select_multipath(res);
2204 #endif
2205
2206         /* create a routing cache entry */
2207         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2208         if (err)
2209                 return err;
2210
2211         /* put it into the cache */
2212         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2213                        rt_genid(dev_net(rth->dst.dev)));
2214         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2215         if (IS_ERR(rth))
2216                 return PTR_ERR(rth);
2217         return 0;
2218 }
2219
2220 /*
2221  *      NOTE. We drop all packets that have local source
2222  *      addresses, because every properly looped-back packet
2223  *      must already have the correct destination attached by the output routine.
2224  *
2225  *      This approach solves two big problems:
2226  *      1. Non-simplex devices are handled properly.
2227  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2228  *      called with rcu_read_lock()
2229  */
2230
2231 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2232                                u8 tos, struct net_device *dev)
2233 {
2234         struct fib_result res;
2235         struct in_device *in_dev = __in_dev_get_rcu(dev);
2236         struct flowi4   fl4;
2237         unsigned int    flags = 0;
2238         u32             itag = 0;
2239         struct rtable   *rth;
2240         unsigned int    hash;
2241         __be32          spec_dst;
2242         int             err = -EINVAL;
2243         struct net    *net = dev_net(dev);
2244
2245         /* IP on this device is disabled. */
2246
2247         if (!in_dev)
2248                 goto out;
2249
2250         /* Check for the most weird martians, which cannot be detected
2251            by fib_lookup.
2252          */
2253
2254         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2255             ipv4_is_loopback(saddr))
2256                 goto martian_source;
2257
2258         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2259                 goto brd_input;
2260
2261         /* Accept zero addresses only to limited broadcast;
2262          * I am not even sure whether to fix this or not. Waiting for complaints :-)
2263          */
2264         if (ipv4_is_zeronet(saddr))
2265                 goto martian_source;
2266
2267         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2268                 goto martian_destination;
2269
2270         /*
2271          *      Now we are ready to route the packet.
2272          */
2273         fl4.flowi4_oif = 0;
2274         fl4.flowi4_iif = dev->ifindex;
2275         fl4.flowi4_mark = skb->mark;
2276         fl4.flowi4_tos = tos;
2277         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2278         fl4.daddr = daddr;
2279         fl4.saddr = saddr;
2280         err = fib_lookup(net, &fl4, &res);
2281         if (err != 0) {
2282                 if (!IN_DEV_FORWARD(in_dev))
2283                         goto e_hostunreach;
2284                 goto no_route;
2285         }
2286
2287         RT_CACHE_STAT_INC(in_slow_tot);
2288
2289         if (res.type == RTN_BROADCAST)
2290                 goto brd_input;
2291
2292         if (res.type == RTN_LOCAL) {
2293                 err = fib_validate_source(skb, saddr, daddr, tos,
2294                                           net->loopback_dev->ifindex,
2295                                           dev, &spec_dst, &itag);
2296                 if (err < 0)
2297                         goto martian_source_keep_err;
2298                 if (err)
2299                         flags |= RTCF_DIRECTSRC;
2300                 spec_dst = daddr;
2301                 goto local_input;
2302         }
2303
2304         if (!IN_DEV_FORWARD(in_dev))
2305                 goto e_hostunreach;
2306         if (res.type != RTN_UNICAST)
2307                 goto martian_destination;
2308
2309         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2310 out:    return err;
2311
2312 brd_input:
2313         if (skb->protocol != htons(ETH_P_IP))
2314                 goto e_inval;
2315
2316         if (ipv4_is_zeronet(saddr))
2317                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2318         else {
2319                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2320                                           &itag);
2321                 if (err < 0)
2322                         goto martian_source_keep_err;
2323                 if (err)
2324                         flags |= RTCF_DIRECTSRC;
2325         }
2326         flags |= RTCF_BROADCAST;
2327         res.type = RTN_BROADCAST;
2328         RT_CACHE_STAT_INC(in_brd);
2329
2330 local_input:
2331         rth = rt_dst_alloc(net->loopback_dev,
2332                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2333         if (!rth)
2334                 goto e_nobufs;
2335
2336         rth->dst.input= ip_local_deliver;
2337         rth->dst.output= ip_rt_bug;
2338 #ifdef CONFIG_IP_ROUTE_CLASSID
2339         rth->dst.tclassid = itag;
2340 #endif
2341
2342         rth->rt_key_dst = daddr;
2343         rth->rt_key_src = saddr;
2344         rth->rt_genid = rt_genid(net);
2345         rth->rt_flags   = flags|RTCF_LOCAL;
2346         rth->rt_type    = res.type;
2347         rth->rt_key_tos = tos;
2348         rth->rt_dst     = daddr;
2349         rth->rt_src     = saddr;
2350 #ifdef CONFIG_IP_ROUTE_CLASSID
2351         rth->dst.tclassid = itag;
2352 #endif
2353         rth->rt_route_iif = dev->ifindex;
2354         rth->rt_iif     = dev->ifindex;
2355         rth->rt_oif     = 0;
2356         rth->rt_mark    = skb->mark;
2357         rth->rt_gateway = daddr;
2358         rth->rt_spec_dst= spec_dst;
2359         rth->rt_peer_genid = 0;
2360         rth->peer = NULL;
2361         rth->fi = NULL;
2362         if (res.type == RTN_UNREACHABLE) {
2363                 rth->dst.input= ip_error;
2364                 rth->dst.error= -err;
2365                 rth->rt_flags   &= ~RTCF_LOCAL;
2366         }
2367         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2368         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2369         err = 0;
2370         if (IS_ERR(rth))
2371                 err = PTR_ERR(rth);
2372         goto out;
2373
2374 no_route:
2375         RT_CACHE_STAT_INC(in_no_route);
2376         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2377         res.type = RTN_UNREACHABLE;
2378         if (err == -ESRCH)
2379                 err = -ENETUNREACH;
2380         goto local_input;
2381
2382         /*
2383          *      Do not cache martian addresses: they should be logged (RFC1812)
2384          */
2385 martian_destination:
2386         RT_CACHE_STAT_INC(in_martian_dst);
2387 #ifdef CONFIG_IP_ROUTE_VERBOSE
2388         if (IN_DEV_LOG_MARTIANS(in_dev))
2389                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2390                                      &daddr, &saddr, dev->name);
2391 #endif
2392
2393 e_hostunreach:
2394         err = -EHOSTUNREACH;
2395         goto out;
2396
2397 e_inval:
2398         err = -EINVAL;
2399         goto out;
2400
2401 e_nobufs:
2402         err = -ENOBUFS;
2403         goto out;
2404
2405 martian_source:
2406         err = -EINVAL;
2407 martian_source_keep_err:
2408         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2409         goto out;
2410 }
2411
2412 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2413                            u8 tos, struct net_device *dev, bool noref)
2414 {
2415         struct rtable   *rth;
2416         unsigned int    hash;
2417         int iif = dev->ifindex;
2418         struct net *net;
2419         int res;
2420
2421         net = dev_net(dev);
2422
2423         rcu_read_lock();
2424
2425         if (!rt_caching(net))
2426                 goto skip_cache;
2427
2428         tos &= IPTOS_RT_MASK;
2429         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2430
2431         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2432              rth = rcu_dereference(rth->dst.rt_next)) {
2433                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2434                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2435                      (rth->rt_route_iif ^ iif) |
2436                      (rth->rt_key_tos ^ tos)) == 0 &&
2437                     rth->rt_mark == skb->mark &&
2438                     net_eq(dev_net(rth->dst.dev), net) &&
2439                     !rt_is_expired(rth)) {
2440                         ipv4_validate_peer(rth);
2441                         if (noref) {
2442                                 dst_use_noref(&rth->dst, jiffies);
2443                                 skb_dst_set_noref(skb, &rth->dst);
2444                         } else {
2445                                 dst_use(&rth->dst, jiffies);
2446                                 skb_dst_set(skb, &rth->dst);
2447                         }
2448                         RT_CACHE_STAT_INC(in_hit);
2449                         rcu_read_unlock();
2450                         return 0;
2451                 }
2452                 RT_CACHE_STAT_INC(in_hlist_search);
2453         }
2454
2455 skip_cache:
2456         /* Multicast recognition logic was moved from the route cache to here.
2457            The problem was that too many Ethernet cards have broken/missing
2458            hardware multicast filters :-( As a result, a host on a multicast
2459            network acquires a lot of useless route cache entries, e.g. for
2460            SDR messages from all over the world. Now we try to get rid of them.
2461            Really, provided the software IP multicast filter is organized
2462            reasonably (at least, hashed), it does not result in a slowdown
2463            compared with route cache reject entries.
2464            Note that multicast routers are not affected, because a
2465            route cache entry is created eventually.
2466          */
2467         if (ipv4_is_multicast(daddr)) {
2468                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2469
2470                 if (in_dev) {
2471                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2472                                                   ip_hdr(skb)->protocol);
2473                         if (our
2474 #ifdef CONFIG_IP_MROUTE
2475                                 ||
2476                             (!ipv4_is_local_multicast(daddr) &&
2477                              IN_DEV_MFORWARD(in_dev))
2478 #endif
2479                            ) {
2480                                 int res = ip_route_input_mc(skb, daddr, saddr,
2481                                                             tos, dev, our);
2482                                 rcu_read_unlock();
2483                                 return res;
2484                         }
2485                 }
2486                 rcu_read_unlock();
2487                 return -EINVAL;
2488         }
2489         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2490         rcu_read_unlock();
2491         return res;
2492 }
2493 EXPORT_SYMBOL(ip_route_input_common);
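
/* The cache-hit test in ip_route_input_common() compares all four lookup keys
 * branchlessly: each key is XORed with the candidate's value and the results
 * are ORed together, so the total is zero only when every field matches.
 * A minimal sketch of the same idiom:
 *
 *	if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
 *	     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
 *	     (rth->rt_route_iif ^ iif) |
 *	     (rth->rt_key_tos ^ tos)) == 0)
 *		;	/* all four fields are equal */
 */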
2494
2495 /* called with rcu_read_lock() */
2496 static struct rtable *__mkroute_output(const struct fib_result *res,
2497                                        const struct flowi4 *fl4,
2498                                        __be32 orig_daddr, __be32 orig_saddr,
2499                                        int orig_oif, __u8 orig_rtos,
2500                                        struct net_device *dev_out,
2501                                        unsigned int flags)
2502 {
2503         struct fib_info *fi = res->fi;
2504         struct in_device *in_dev;
2505         u16 type = res->type;
2506         struct rtable *rth;
2507
2508         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2509                 return ERR_PTR(-EINVAL);
2510
2511         if (ipv4_is_lbcast(fl4->daddr))
2512                 type = RTN_BROADCAST;
2513         else if (ipv4_is_multicast(fl4->daddr))
2514                 type = RTN_MULTICAST;
2515         else if (ipv4_is_zeronet(fl4->daddr))
2516                 return ERR_PTR(-EINVAL);
2517
2518         if (dev_out->flags & IFF_LOOPBACK)
2519                 flags |= RTCF_LOCAL;
2520
2521         in_dev = __in_dev_get_rcu(dev_out);
2522         if (!in_dev)
2523                 return ERR_PTR(-EINVAL);
2524
2525         if (type == RTN_BROADCAST) {
2526                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2527                 fi = NULL;
2528         } else if (type == RTN_MULTICAST) {
2529                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2530                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2531                                      fl4->flowi4_proto))
2532                         flags &= ~RTCF_LOCAL;
2533                 /* If a multicast route does not exist, use the
2534                  * default one, but do not gateway in this case.
2535                  * Yes, it is a hack.
2536                  */
2537                 if (fi && res->prefixlen < 4)
2538                         fi = NULL;
2539         }
2540
2541         rth = rt_dst_alloc(dev_out,
2542                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2543                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2544         if (!rth)
2545                 return ERR_PTR(-ENOBUFS);
2546
2547         rth->dst.output = ip_output;
2548
2549         rth->rt_key_dst = orig_daddr;
2550         rth->rt_key_src = orig_saddr;
2551         rth->rt_genid = rt_genid(dev_net(dev_out));
2552         rth->rt_flags   = flags;
2553         rth->rt_type    = type;
2554         rth->rt_key_tos = orig_rtos;
2555         rth->rt_dst     = fl4->daddr;
2556         rth->rt_src     = fl4->saddr;
2557         rth->rt_route_iif = 0;
2558         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2559         rth->rt_oif     = orig_oif;
2560         rth->rt_mark    = fl4->flowi4_mark;
2561         rth->rt_gateway = fl4->daddr;
2562         rth->rt_spec_dst= fl4->saddr;
2563         rth->rt_peer_genid = 0;
2564         rth->peer = NULL;
2565         rth->fi = NULL;
2566
2567         RT_CACHE_STAT_INC(out_slow_tot);
2568
2569         if (flags & RTCF_LOCAL) {
2570                 rth->dst.input = ip_local_deliver;
2571                 rth->rt_spec_dst = fl4->daddr;
2572         }
2573         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2574                 rth->rt_spec_dst = fl4->saddr;
2575                 if (flags & RTCF_LOCAL &&
2576                     !(dev_out->flags & IFF_LOOPBACK)) {
2577                         rth->dst.output = ip_mc_output;
2578                         RT_CACHE_STAT_INC(out_slow_mc);
2579                 }
2580 #ifdef CONFIG_IP_MROUTE
2581                 if (type == RTN_MULTICAST) {
2582                         if (IN_DEV_MFORWARD(in_dev) &&
2583                             !ipv4_is_local_multicast(fl4->daddr)) {
2584                                 rth->dst.input = ip_mr_input;
2585                                 rth->dst.output = ip_mc_output;
2586                         }
2587                 }
2588 #endif
2589         }
2590
2591         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2592
2593         return rth;
2594 }
2595
2596 /*
2597  * Major route resolver routine.
2598  * called with rcu_read_lock();
2599  */
2600
2601 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2602 {
2603         struct net_device *dev_out = NULL;
2604         __u8 tos = RT_FL_TOS(fl4);
2605         unsigned int flags = 0;
2606         struct fib_result res;
2607         struct rtable *rth;
2608         __be32 orig_daddr;
2609         __be32 orig_saddr;
2610         int orig_oif;
2611
2612         res.fi          = NULL;
2613 #ifdef CONFIG_IP_MULTIPLE_TABLES
2614         res.r           = NULL;
2615 #endif
2616
2617         orig_daddr = fl4->daddr;
2618         orig_saddr = fl4->saddr;
2619         orig_oif = fl4->flowi4_oif;
2620
2621         fl4->flowi4_iif = net->loopback_dev->ifindex;
2622         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2623         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2624                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2625
2626         rcu_read_lock();
2627         if (fl4->saddr) {
2628                 rth = ERR_PTR(-EINVAL);
2629                 if (ipv4_is_multicast(fl4->saddr) ||
2630                     ipv4_is_lbcast(fl4->saddr) ||
2631                     ipv4_is_zeronet(fl4->saddr))
2632                         goto out;
2633
2634                 /* I removed the check for oif == dev_out->oif here.
2635                    It was wrong for two reasons:
2636                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2637                       is assigned to multiple interfaces.
2638                    2. Moreover, we are allowed to send packets with the saddr
2639                       of another iface. --ANK
2640                  */
2641
2642                 if (fl4->flowi4_oif == 0 &&
2643                     (ipv4_is_multicast(fl4->daddr) ||
2644                      ipv4_is_lbcast(fl4->daddr))) {
2645                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2646                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2647                         if (dev_out == NULL)
2648                                 goto out;
2649
2650                         /* Special hack: the user can direct multicasts
2651                            and limited broadcast via the necessary interface
2652                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2653                            This hack is not just for fun; it allows
2654                            vic, vat and friends to work.
2655                            They bind the socket to loopback, set the ttl to zero
2656                            and expect that it will work.
2657                            From the viewpoint of the routing cache they are broken,
2658                            because we are not allowed to build a multicast path
2659                            with a loopback source addr (the routing cache
2660                            cannot know that the ttl is zero, so the packet
2661                            will not leave this host and the route is valid).
2662                            Luckily, this hack is a good workaround.
2663                          */
2664
2665                         fl4->flowi4_oif = dev_out->ifindex;
2666                         goto make_route;
2667                 }
2668
2669                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2670                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2671                         if (!__ip_dev_find(net, fl4->saddr, false))
2672                                 goto out;
2673                 }
2674         }
2675
2676
2677         if (fl4->flowi4_oif) {
2678                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2679                 rth = ERR_PTR(-ENODEV);
2680                 if (dev_out == NULL)
2681                         goto out;
2682
2683                 /* RACE: Check return value of inet_select_addr instead. */
2684                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2685                         rth = ERR_PTR(-ENETUNREACH);
2686                         goto out;
2687                 }
2688                 if (ipv4_is_local_multicast(fl4->daddr) ||
2689                     ipv4_is_lbcast(fl4->daddr)) {
2690                         if (!fl4->saddr)
2691                                 fl4->saddr = inet_select_addr(dev_out, 0,
2692                                                               RT_SCOPE_LINK);
2693                         goto make_route;
2694                 }
2695                 if (fl4->saddr) {
2696                         if (ipv4_is_multicast(fl4->daddr))
2697                                 fl4->saddr = inet_select_addr(dev_out, 0,
2698                                                               fl4->flowi4_scope);
2699                         else if (!fl4->daddr)
2700                                 fl4->saddr = inet_select_addr(dev_out, 0,
2701                                                               RT_SCOPE_HOST);
2702                 }
2703         }
2704
2705         if (!fl4->daddr) {
2706                 fl4->daddr = fl4->saddr;
2707                 if (!fl4->daddr)
2708                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2709                 dev_out = net->loopback_dev;
2710                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2711                 res.type = RTN_LOCAL;
2712                 flags |= RTCF_LOCAL;
2713                 goto make_route;
2714         }
2715
2716         if (fib_lookup(net, fl4, &res)) {
2717                 res.fi = NULL;
2718                 if (fl4->flowi4_oif) {
2719                         /* Apparently, the routing tables are wrong. Assume
2720                            that the destination is on-link.
2721
2722                            WHY? DW.
2723                            Because we are allowed to send to an iface
2724                            even if it has NO routes and NO assigned
2725                            addresses. When oif is specified, the routing
2726                            tables are looked up with only one purpose:
2727                            to catch whether the destination is gatewayed, rather than
2728                            direct. Moreover, if MSG_DONTROUTE is set,
2729                            we send the packet, ignoring both routing tables
2730                            and ifaddr state. --ANK
2731
2732
2733                            We could do this even if oif is unknown,
2734                            as IPv6 likely does, but we do not.
2735                          */
2736
2737                         if (fl4->saddr == 0)
2738                                 fl4->saddr = inet_select_addr(dev_out, 0,
2739                                                               RT_SCOPE_LINK);
2740                         res.type = RTN_UNICAST;
2741                         goto make_route;
2742                 }
2743                 rth = ERR_PTR(-ENETUNREACH);
2744                 goto out;
2745         }
2746
2747         if (res.type == RTN_LOCAL) {
2748                 if (!fl4->saddr) {
2749                         if (res.fi->fib_prefsrc)
2750                                 fl4->saddr = res.fi->fib_prefsrc;
2751                         else
2752                                 fl4->saddr = fl4->daddr;
2753                 }
2754                 dev_out = net->loopback_dev;
2755                 fl4->flowi4_oif = dev_out->ifindex;
2756                 res.fi = NULL;
2757                 flags |= RTCF_LOCAL;
2758                 goto make_route;
2759         }
2760
2761 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2762         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2763                 fib_select_multipath(&res);
2764         else
2765 #endif
2766         if (!res.prefixlen &&
2767             res.table->tb_num_default > 1 &&
2768             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2769                 fib_select_default(&res);
2770
2771         if (!fl4->saddr)
2772                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2773
2774         dev_out = FIB_RES_DEV(res);
2775         fl4->flowi4_oif = dev_out->ifindex;
2776
2777
2778 make_route:
2779         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2780                                tos, dev_out, flags);
2781         if (!IS_ERR(rth)) {
2782                 unsigned int hash;
2783
2784                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2785                                rt_genid(dev_net(dev_out)));
2786                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2787         }
2788
2789 out:
2790         rcu_read_unlock();
2791         return rth;
2792 }
2793
2794 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2795 {
2796         struct rtable *rth;
2797         unsigned int hash;
2798
2799         if (!rt_caching(net))
2800                 goto slow_output;
2801
2802         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2803
2804         rcu_read_lock_bh();
2805         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2806                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2807                 if (rth->rt_key_dst == flp4->daddr &&
2808                     rth->rt_key_src == flp4->saddr &&
2809                     rt_is_output_route(rth) &&
2810                     rth->rt_oif == flp4->flowi4_oif &&
2811                     rth->rt_mark == flp4->flowi4_mark &&
2812                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2813                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2814                     net_eq(dev_net(rth->dst.dev), net) &&
2815                     !rt_is_expired(rth)) {
2816                         ipv4_validate_peer(rth);
2817                         dst_use(&rth->dst, jiffies);
2818                         RT_CACHE_STAT_INC(out_hit);
2819                         rcu_read_unlock_bh();
2820                         if (!flp4->saddr)
2821                                 flp4->saddr = rth->rt_src;
2822                         if (!flp4->daddr)
2823                                 flp4->daddr = rth->rt_dst;
2824                         return rth;
2825                 }
2826                 RT_CACHE_STAT_INC(out_hlist_search);
2827         }
2828         rcu_read_unlock_bh();
2829
2830 slow_output:
2831         return ip_route_output_slow(net, flp4);
2832 }
2833 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2834
2835 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2836 {
2837         return NULL;
2838 }
2839
2840 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2841 {
2842         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2843
2844         return mtu ? : dst->dev->mtu;
2845 }
2846
2847 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2848 {
2849 }
2850
2851 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2852                                           unsigned long old)
2853 {
2854         return NULL;
2855 }
2856
2857 static struct dst_ops ipv4_dst_blackhole_ops = {
2858         .family                 =       AF_INET,
2859         .protocol               =       cpu_to_be16(ETH_P_IP),
2860         .destroy                =       ipv4_dst_destroy,
2861         .check                  =       ipv4_blackhole_dst_check,
2862         .mtu                    =       ipv4_blackhole_mtu,
2863         .default_advmss         =       ipv4_default_advmss,
2864         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2865         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2866         .neigh_lookup           =       ipv4_neigh_lookup,
2867 };
2868
2869 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2870 {
2871         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2872         struct rtable *ort = (struct rtable *) dst_orig;
2873
2874         if (rt) {
2875                 struct dst_entry *new = &rt->dst;
2876
2877                 new->__use = 1;
2878                 new->input = dst_discard;
2879                 new->output = dst_discard;
2880                 dst_copy_metrics(new, &ort->dst);
2881
2882                 new->dev = ort->dst.dev;
2883                 if (new->dev)
2884                         dev_hold(new->dev);
2885
2886                 rt->rt_key_dst = ort->rt_key_dst;
2887                 rt->rt_key_src = ort->rt_key_src;
2888                 rt->rt_key_tos = ort->rt_key_tos;
2889                 rt->rt_route_iif = ort->rt_route_iif;
2890                 rt->rt_iif = ort->rt_iif;
2891                 rt->rt_oif = ort->rt_oif;
2892                 rt->rt_mark = ort->rt_mark;
2893
2894                 rt->rt_genid = rt_genid(net);
2895                 rt->rt_flags = ort->rt_flags;
2896                 rt->rt_type = ort->rt_type;
2897                 rt->rt_dst = ort->rt_dst;
2898                 rt->rt_src = ort->rt_src;
2899                 rt->rt_gateway = ort->rt_gateway;
2900                 rt->rt_spec_dst = ort->rt_spec_dst;
2901                 rt->peer = ort->peer;
2902                 if (rt->peer)
2903                         atomic_inc(&rt->peer->refcnt);
2904                 rt->fi = ort->fi;
2905                 if (rt->fi)
2906                         atomic_inc(&rt->fi->fib_clntref);
2907
2908                 dst_free(new);
2909         }
2910
2911         dst_release(dst_orig);
2912
2913         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2914 }
2915
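/*
 * Resolve an output route for flp4 and, when the flow carries a
 * protocol, run the result through xfrm_lookup() so that IPsec policy
 * may wrap or replace the plain dst before it is handed back.
 */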
2916 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2917                                     struct sock *sk)
2918 {
2919         struct rtable *rt = __ip_route_output_key(net, flp4);
2920
2921         if (IS_ERR(rt))
2922                 return rt;
2923
2924         if (flp4->flowi4_proto)
2925                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2926                                                    flowi4_to_flowi(flp4),
2927                                                    sk, 0);
2928
2929         return rt;
2930 }
2931 EXPORT_SYMBOL_GPL(ip_route_output_flow);
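/*
 * Caller-side sketch (editorial addition, not part of the original
 * file): one hypothetical way to route an outbound UDP flow with the
 * helper above.  "net", "daddr", "saddr" and "sk" are assumed to be
 * supplied by the caller; real users typically also set the tos and
 * mark fields and cache the resulting route on the socket.
 *
 *	struct flowi4 fl4 = {
 *		.daddr        = daddr,
 *		.saddr        = saddr,
 *		.flowi4_oif   = sk->sk_bound_dev_if,
 *		.flowi4_proto = IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	... transmit via rt->dst, then drop the reference ...
 *	ip_rt_put(rt);
 */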
2932
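/*
 * Fill one cached route into an RTM_NEWROUTE netlink message: the rtmsg
 * header plus RTA_DST/RTA_SRC/RTA_OIF/RTA_PREFSRC/RTA_GATEWAY, metrics
 * and mark, and the cache info (IP id, TCP timestamp age and PMTU
 * expiry) taken from the attached inet_peer, if any.
 */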
2933 static int rt_fill_info(struct net *net,
2934                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2935                         int nowait, unsigned int flags)
2936 {
2937         struct rtable *rt = skb_rtable(skb);
2938         struct rtmsg *r;
2939         struct nlmsghdr *nlh;
2940         unsigned long expires = 0;
2941         const struct inet_peer *peer = rt->peer;
2942         u32 id = 0, ts = 0, tsage = 0, error;
2943
2944         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2945         if (nlh == NULL)
2946                 return -EMSGSIZE;
2947
2948         r = nlmsg_data(nlh);
2949         r->rtm_family    = AF_INET;
2950         r->rtm_dst_len  = 32;
2951         r->rtm_src_len  = 0;
2952         r->rtm_tos      = rt->rt_key_tos;
2953         r->rtm_table    = RT_TABLE_MAIN;
2954         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2955                 goto nla_put_failure;
2956         r->rtm_type     = rt->rt_type;
2957         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2958         r->rtm_protocol = RTPROT_UNSPEC;
2959         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2960         if (rt->rt_flags & RTCF_NOTIFY)
2961                 r->rtm_flags |= RTM_F_NOTIFY;
2962
2963         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2964                 goto nla_put_failure;
2965         if (rt->rt_key_src) {
2966                 r->rtm_src_len = 32;
2967                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2968                         goto nla_put_failure;
2969         }
2970         if (rt->dst.dev &&
2971             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2972                 goto nla_put_failure;
2973 #ifdef CONFIG_IP_ROUTE_CLASSID
2974         if (rt->dst.tclassid &&
2975             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2976                 goto nla_put_failure;
2977 #endif
2978         if (rt_is_input_route(rt)) {
2979                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2980                         goto nla_put_failure;
2981         } else if (rt->rt_src != rt->rt_key_src) {
2982                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2983                         goto nla_put_failure;
2984         }
2985         if (rt->rt_dst != rt->rt_gateway &&
2986             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2987                 goto nla_put_failure;
2988
2989         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2990                 goto nla_put_failure;
2991
2992         if (rt->rt_mark &&
2993             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2994                 goto nla_put_failure;
2995
2996         error = rt->dst.error;
2997         if (peer) {
2998                 inet_peer_refcheck(rt->peer);
2999                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3000                 if (peer->tcp_ts_stamp) {
3001                         ts = peer->tcp_ts;
3002                         tsage = get_seconds() - peer->tcp_ts_stamp;
3003                 }
3004                 expires = ACCESS_ONCE(peer->pmtu_expires);
3005                 if (expires) {
3006                         if (time_before(jiffies, expires))
3007                                 expires -= jiffies;
3008                         else
3009                                 expires = 0;
3010                 }
3011         }
3012
3013         if (rt_is_input_route(rt)) {
3014 #ifdef CONFIG_IP_MROUTE
3015                 __be32 dst = rt->rt_dst;
3016
3017                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3018                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3019                         int err = ipmr_get_route(net, skb,
3020                                                  rt->rt_src, rt->rt_dst,
3021                                                  r, nowait);
3022                         if (err <= 0) {
3023                                 if (!nowait) {
3024                                         if (err == 0)
3025                                                 return 0;
3026                                         goto nla_put_failure;
3027                                 } else {
3028                                         if (err == -EMSGSIZE)
3029                                                 goto nla_put_failure;
3030                                         error = err;
3031                                 }
3032                         }
3033                 } else
3034 #endif
3035                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3036                                 goto nla_put_failure;
3037         }
3038
3039         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3040                                expires, error) < 0)
3041                 goto nla_put_failure;
3042
3043         return nlmsg_end(skb, nlh);
3044
3045 nla_put_failure:
3046         nlmsg_cancel(skb, nlh);
3047         return -EMSGSIZE;
3048 }
3049
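/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the route either
 * through ip_route_input() (when RTA_IIF is supplied) or through
 * ip_route_output_key(), then fill the reply with rt_fill_info() and
 * unicast it back to the requesting socket.
 */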
3050 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3051 {
3052         struct net *net = sock_net(in_skb->sk);
3053         struct rtmsg *rtm;
3054         struct nlattr *tb[RTA_MAX+1];
3055         struct rtable *rt = NULL;
3056         __be32 dst = 0;
3057         __be32 src = 0;
3058         u32 iif;
3059         int err;
3060         int mark;
3061         struct sk_buff *skb;
3062
3063         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3064         if (err < 0)
3065                 goto errout;
3066
3067         rtm = nlmsg_data(nlh);
3068
3069         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3070         if (skb == NULL) {
3071                 err = -ENOBUFS;
3072                 goto errout;
3073         }
3074
3075         /* Reserve room for dummy headers; this skb can pass
3076            through a good chunk of the routing engine.
3077          */
3078         skb_reset_mac_header(skb);
3079         skb_reset_network_header(skb);
3080
3081         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3082         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3083         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3084
3085         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3086         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3087         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3088         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3089
3090         if (iif) {
3091                 struct net_device *dev;
3092
3093                 dev = __dev_get_by_index(net, iif);
3094                 if (dev == NULL) {
3095                         err = -ENODEV;
3096                         goto errout_free;
3097                 }
3098
3099                 skb->protocol   = htons(ETH_P_IP);
3100                 skb->dev        = dev;
3101                 skb->mark       = mark;
3102                 local_bh_disable();
3103                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3104                 local_bh_enable();
3105
3106                 rt = skb_rtable(skb);
3107                 if (err == 0 && rt->dst.error)
3108                         err = -rt->dst.error;
3109         } else {
3110                 struct flowi4 fl4 = {
3111                         .daddr = dst,
3112                         .saddr = src,
3113                         .flowi4_tos = rtm->rtm_tos,
3114                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3115                         .flowi4_mark = mark,
3116                 };
3117                 rt = ip_route_output_key(net, &fl4);
3118
3119                 err = 0;
3120                 if (IS_ERR(rt))
3121                         err = PTR_ERR(rt);
3122         }
3123
3124         if (err)
3125                 goto errout_free;
3126
3127         skb_dst_set(skb, &rt->dst);
3128         if (rtm->rtm_flags & RTM_F_NOTIFY)
3129                 rt->rt_flags |= RTCF_NOTIFY;
3130
3131         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3132                            RTM_NEWROUTE, 0, 0);
3133         if (err <= 0)
3134                 goto errout_free;
3135
3136         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3137 errout:
3138         return err;
3139
3140 errout_free:
3141         kfree_skb(skb);
3142         goto errout;
3143 }
3144
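/*
 * Dump the route cache: walk every hash chain under rcu_read_lock_bh(),
 * skip entries belonging to other namespaces or expired generations,
 * and resume from cb->args[0]/args[1] (hash bucket and chain index)
 * across successive dump calls.
 */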
3145 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3146 {
3147         struct rtable *rt;
3148         int h, s_h;
3149         int idx, s_idx;
3150         struct net *net;
3151
3152         net = sock_net(skb->sk);
3153
3154         s_h = cb->args[0];
3155         if (s_h < 0)
3156                 s_h = 0;
3157         s_idx = idx = cb->args[1];
3158         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3159                 if (!rt_hash_table[h].chain)
3160                         continue;
3161                 rcu_read_lock_bh();
3162                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3163                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3164                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3165                                 continue;
3166                         if (rt_is_expired(rt))
3167                                 continue;
3168                         skb_dst_set_noref(skb, &rt->dst);
3169                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3170                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3171                                          1, NLM_F_MULTI) <= 0) {
3172                                 skb_dst_drop(skb);
3173                                 rcu_read_unlock_bh();
3174                                 goto done;
3175                         }
3176                         skb_dst_drop(skb);
3177                 }
3178                 rcu_read_unlock_bh();
3179         }
3180
3181 done:
3182         cb->args[0] = h;
3183         cb->args[1] = idx;
3184         return skb->len;
3185 }
3186
3187 void ip_rt_multicast_event(struct in_device *in_dev)
3188 {
3189         rt_cache_flush(dev_net(in_dev->dev), 0);
3190 }
3191
3192 #ifdef CONFIG_SYSCTL
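/*
 * Handler for /proc/sys/net/ipv4/route/flush: the written value is
 * parsed as a flush delay and the per-namespace route cache is flushed;
 * reads are rejected with -EINVAL.
 */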
3193 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3194                                         void __user *buffer,
3195                                         size_t *lenp, loff_t *ppos)
3196 {
3197         if (write) {
3198                 int flush_delay;
3199                 ctl_table ctl;
3200                 struct net *net;
3201
3202                 memcpy(&ctl, __ctl, sizeof(ctl));
3203                 ctl.data = &flush_delay;
3204                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3205
3206                 net = (struct net *)__ctl->extra1;
3207                 rt_cache_flush(net, flush_delay);
3208                 return 0;
3209         }
3210
3211         return -EINVAL;
3212 }
3213
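/* Tunables exposed under /proc/sys/net/ipv4/route/. */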
3214 static ctl_table ipv4_route_table[] = {
3215         {
3216                 .procname       = "gc_thresh",
3217                 .data           = &ipv4_dst_ops.gc_thresh,
3218                 .maxlen         = sizeof(int),
3219                 .mode           = 0644,
3220                 .proc_handler   = proc_dointvec,
3221         },
3222         {
3223                 .procname       = "max_size",
3224                 .data           = &ip_rt_max_size,
3225                 .maxlen         = sizeof(int),
3226                 .mode           = 0644,
3227                 .proc_handler   = proc_dointvec,
3228         },
3229         {
3230                 /* Deprecated. Use gc_min_interval_ms */
3231
3232                 .procname       = "gc_min_interval",
3233                 .data           = &ip_rt_gc_min_interval,
3234                 .maxlen         = sizeof(int),
3235                 .mode           = 0644,
3236                 .proc_handler   = proc_dointvec_jiffies,
3237         },
3238         {
3239                 .procname       = "gc_min_interval_ms",
3240                 .data           = &ip_rt_gc_min_interval,
3241                 .maxlen         = sizeof(int),
3242                 .mode           = 0644,
3243                 .proc_handler   = proc_dointvec_ms_jiffies,
3244         },
3245         {
3246                 .procname       = "gc_timeout",
3247                 .data           = &ip_rt_gc_timeout,
3248                 .maxlen         = sizeof(int),
3249                 .mode           = 0644,
3250                 .proc_handler   = proc_dointvec_jiffies,
3251         },
3252         {
3253                 .procname       = "gc_interval",
3254                 .data           = &ip_rt_gc_interval,
3255                 .maxlen         = sizeof(int),
3256                 .mode           = 0644,
3257                 .proc_handler   = proc_dointvec_jiffies,
3258         },
3259         {
3260                 .procname       = "redirect_load",
3261                 .data           = &ip_rt_redirect_load,
3262                 .maxlen         = sizeof(int),
3263                 .mode           = 0644,
3264                 .proc_handler   = proc_dointvec,
3265         },
3266         {
3267                 .procname       = "redirect_number",
3268                 .data           = &ip_rt_redirect_number,
3269                 .maxlen         = sizeof(int),
3270                 .mode           = 0644,
3271                 .proc_handler   = proc_dointvec,
3272         },
3273         {
3274                 .procname       = "redirect_silence",
3275                 .data           = &ip_rt_redirect_silence,
3276                 .maxlen         = sizeof(int),
3277                 .mode           = 0644,
3278                 .proc_handler   = proc_dointvec,
3279         },
3280         {
3281                 .procname       = "error_cost",
3282                 .data           = &ip_rt_error_cost,
3283                 .maxlen         = sizeof(int),
3284                 .mode           = 0644,
3285                 .proc_handler   = proc_dointvec,
3286         },
3287         {
3288                 .procname       = "error_burst",
3289                 .data           = &ip_rt_error_burst,
3290                 .maxlen         = sizeof(int),
3291                 .mode           = 0644,
3292                 .proc_handler   = proc_dointvec,
3293         },
3294         {
3295                 .procname       = "gc_elasticity",
3296                 .data           = &ip_rt_gc_elasticity,
3297                 .maxlen         = sizeof(int),
3298                 .mode           = 0644,
3299                 .proc_handler   = proc_dointvec,
3300         },
3301         {
3302                 .procname       = "mtu_expires",
3303                 .data           = &ip_rt_mtu_expires,
3304                 .maxlen         = sizeof(int),
3305                 .mode           = 0644,
3306                 .proc_handler   = proc_dointvec_jiffies,
3307         },
3308         {
3309                 .procname       = "min_pmtu",
3310                 .data           = &ip_rt_min_pmtu,
3311                 .maxlen         = sizeof(int),
3312                 .mode           = 0644,
3313                 .proc_handler   = proc_dointvec,
3314         },
3315         {
3316                 .procname       = "min_adv_mss",
3317                 .data           = &ip_rt_min_advmss,
3318                 .maxlen         = sizeof(int),
3319                 .mode           = 0644,
3320                 .proc_handler   = proc_dointvec,
3321         },
3322         { }
3323 };
3324
3325 static struct ctl_table ipv4_route_flush_table[] = {
3326         {
3327                 .procname       = "flush",
3328                 .maxlen         = sizeof(int),
3329                 .mode           = 0200,
3330                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3331         },
3332         { },
3333 };
3334
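/*
 * Per-namespace registration of the "flush" entry.  Namespaces other
 * than init_net get their own copy of the table so that ->extra1 can
 * point back at the owning struct net.
 */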
3335 static __net_init int sysctl_route_net_init(struct net *net)
3336 {
3337         struct ctl_table *tbl;
3338
3339         tbl = ipv4_route_flush_table;
3340         if (!net_eq(net, &init_net)) {
3341                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3342                 if (tbl == NULL)
3343                         goto err_dup;
3344         }
3345         tbl[0].extra1 = net;
3346
3347         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3348         if (net->ipv4.route_hdr == NULL)
3349                 goto err_reg;
3350         return 0;
3351
3352 err_reg:
3353         if (tbl != ipv4_route_flush_table)
3354                 kfree(tbl);
3355 err_dup:
3356         return -ENOMEM;
3357 }
3358
3359 static __net_exit void sysctl_route_net_exit(struct net *net)
3360 {
3361         struct ctl_table *tbl;
3362
3363         tbl = net->ipv4.route_hdr->ctl_table_arg;
3364         unregister_net_sysctl_table(net->ipv4.route_hdr);
3365         BUG_ON(tbl == ipv4_route_flush_table);
3366         kfree(tbl);
3367 }
3368
3369 static __net_initdata struct pernet_operations sysctl_route_ops = {
3370         .init = sysctl_route_net_init,
3371         .exit = sysctl_route_net_exit,
3372 };
3373 #endif
3374
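/* Seed the per-namespace route cache and device address generation ids. */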
3375 static __net_init int rt_genid_init(struct net *net)
3376 {
3377         get_random_bytes(&net->ipv4.rt_genid,
3378                          sizeof(net->ipv4.rt_genid));
3379         get_random_bytes(&net->ipv4.dev_addr_genid,
3380                          sizeof(net->ipv4.dev_addr_genid));
3381         return 0;
3382 }
3383
3384 static __net_initdata struct pernet_operations rt_genid_ops = {
3385         .init = rt_genid_init,
3386 };
3387
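/*
 * Per-namespace inet_peer storage: allocate and initialize the peer
 * base at namespace creation; on exit, invalidate the whole tree before
 * freeing it.
 */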
3388 static int __net_init ipv4_inetpeer_init(struct net *net)
3389 {
3390         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3391
3392         if (!bp)
3393                 return -ENOMEM;
3394         inet_peer_base_init(bp);
3395         net->ipv4.peers = bp;
3396         return 0;
3397 }
3398
3399 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3400 {
3401         struct inet_peer_base *bp = net->ipv4.peers;
3402
3403         net->ipv4.peers = NULL;
3404         inetpeer_invalidate_tree(bp);
3405         kfree(bp);
3406 }
3407
3408 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3409         .init   =       ipv4_inetpeer_init,
3410         .exit   =       ipv4_inetpeer_exit,
3411 };
3412
3413 #ifdef CONFIG_IP_ROUTE_CLASSID
3414 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3415 #endif /* CONFIG_IP_ROUTE_CLASSID */
3416
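/* The "rhash_entries=" boot parameter overrides the route cache hash
 * table size that alloc_large_system_hash() would otherwise choose.
 */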
3417 static __initdata unsigned long rhash_entries;
3418 static int __init set_rhash_entries(char *str)
3419 {
3420         ssize_t ret;
3421
3422         if (!str)
3423                 return 0;
3424
3425         ret = kstrtoul(str, 0, &rhash_entries);
3426         if (ret)
3427                 return 0;
3428
3429         return 1;
3430 }
3431 __setup("rhash_entries=", set_rhash_entries);
3432
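/*
 * Boot-time initialization: allocate the dst slab and the route cache
 * hash table, derive gc_thresh and ip_rt_max_size from the table size,
 * start the deferrable GC work, create the proc files, and register the
 * RTM_GETROUTE handler plus the per-namespace sysctl, genid and
 * inet_peer operations (and the percpu classid accounting and xfrm
 * setup when configured).
 */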
3433 int __init ip_rt_init(void)
3434 {
3435         int rc = 0;
3436
3437 #ifdef CONFIG_IP_ROUTE_CLASSID
3438         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3439         if (!ip_rt_acct)
3440                 panic("IP: failed to allocate ip_rt_acct\n");
3441 #endif
3442
3443         ipv4_dst_ops.kmem_cachep =
3444                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3445                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3446
3447         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3448
3449         if (dst_entries_init(&ipv4_dst_ops) < 0)
3450                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3451
3452         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3453                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3454
3455         rt_hash_table = (struct rt_hash_bucket *)
3456                 alloc_large_system_hash("IP route cache",
3457                                         sizeof(struct rt_hash_bucket),
3458                                         rhash_entries,
3459                                         (totalram_pages >= 128 * 1024) ?
3460                                         15 : 17,
3461                                         0,
3462                                         &rt_hash_log,
3463                                         &rt_hash_mask,
3464                                         0,
3465                                         rhash_entries ? 0 : 512 * 1024);
3466         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3467         rt_hash_lock_init();
3468
3469         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3470         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3471
3472         devinet_init();
3473         ip_fib_init();
3474
3475         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3476         expires_ljiffies = jiffies;
3477         schedule_delayed_work(&expires_work,
3478                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3479
3480         if (ip_rt_proc_init())
3481                 pr_err("Unable to create route proc files\n");
3482 #ifdef CONFIG_XFRM
3483         xfrm_init();
3484         xfrm4_init(ip_rt_max_size);
3485 #endif
3486         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3487
3488 #ifdef CONFIG_SYSCTL
3489         register_pernet_subsys(&sysctl_route_ops);
3490 #endif
3491         register_pernet_subsys(&rt_genid_ops);
3492         register_pernet_subsys(&ipv4_inetpeer_ops);
3493         return rc;
3494 }
3495
3496 #ifdef CONFIG_SYSCTL
3497 /*
3498  * We really need to sanitize the damn ipv4 init order, then all
3499  * this nonsense will go away.
3500  */
3501 void __init ip_static_sysctl_init(void)
3502 {
3503         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3504 }
3505 #endif