2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #define pr_fmt(fmt) "IPv4: " fmt
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
114 #include <net/secure_seq.h>
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
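/*
 * Illustrative sketch, not part of the original file: RT_FL_TOS() keeps
 * only the routing-relevant TOS bits of the flow key plus the RTO_ONLINK
 * flag, discarding precedence and ECN bits. The helper name below is an
 * assumption for illustration.
 */
static inline __u8 illustrate_rt_fl_tos(const struct flowi4 *fl4)
{
	/* same masking as RT_FL_TOS(), applied to the flowi4 TOS byte */
	return fl4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK);
}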
119 #define IP_MAX_MTU 0xFFF0
121 #define RT_GC_TIMEOUT (300*HZ)
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly = 9;
128 static int ip_rt_redirect_load __read_mostly = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly = HZ;
131 static int ip_rt_error_burst __read_mostly = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly = 8;
133 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly = 256;
136 static int rt_chain_length_max __read_mostly = 20;
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
142 * Interface to generic destination cache.
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int ipv4_mtu(const struct dst_entry *dst);
148 static void ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void ipv4_link_failure(struct sk_buff *skb);
151 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
152 struct sk_buff *skb, u32 mtu);
153 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
154 struct sk_buff *skb);
155 static int rt_garbage_collect(struct dst_ops *ops);
157 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
162 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
168 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
172 static struct dst_ops ipv4_dst_ops = {
174 .protocol = cpu_to_be16(ETH_P_IP),
175 .gc = rt_garbage_collect,
176 .check = ipv4_dst_check,
177 .default_advmss = ipv4_default_advmss,
179 .cow_metrics = ipv4_cow_metrics,
180 .destroy = ipv4_dst_destroy,
181 .ifdown = ipv4_dst_ifdown,
182 .negative_advice = ipv4_negative_advice,
183 .link_failure = ipv4_link_failure,
184 .update_pmtu = ip_rt_update_pmtu,
185 .redirect = ip_do_redirect,
186 .local_out = __ip_local_out,
187 .neigh_lookup = ipv4_neigh_lookup,
190 #define ECN_OR_COST(class) TC_PRIO_##class
192 const __u8 ip_tos2prio[16] = {
193 TC_PRIO_BESTEFFORT,
194 ECN_OR_COST(BESTEFFORT),
195 TC_PRIO_BESTEFFORT,
196 ECN_OR_COST(BESTEFFORT),
197 TC_PRIO_BULK,
198 ECN_OR_COST(BULK),
199 TC_PRIO_BULK,
200 ECN_OR_COST(BULK),
201 TC_PRIO_INTERACTIVE,
202 ECN_OR_COST(INTERACTIVE),
203 TC_PRIO_INTERACTIVE,
204 ECN_OR_COST(INTERACTIVE),
205 TC_PRIO_INTERACTIVE_BULK,
206 ECN_OR_COST(INTERACTIVE_BULK),
207 TC_PRIO_INTERACTIVE_BULK,
208 ECN_OR_COST(INTERACTIVE_BULK)
209 };
210 EXPORT_SYMBOL(ip_tos2prio);
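/*
 * Illustrative sketch (assumption, mirroring the kernel's rt_tos2priority()
 * helper): the table above is indexed by the four TOS bits of the IP
 * header, after dropping the low bit.
 */
static inline char illustrate_tos2prio(u8 tos)
{
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}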
216 /* The locking scheme is rather straightforward:
218 * 1) Read-Copy Update protects the buckets of the central route hash.
219 * 2) Only writers remove entries, and they hold the lock
220 * as they look at rtable reference counts.
221 * 3) Only readers acquire references to rtable entries,
222 * they do so with atomic increments and with the
226 struct rt_hash_bucket {
227 struct rtable __rcu *chain;
230 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
231 defined(CONFIG_PROVE_LOCKING)
233 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
234 * The size of this table is a power of two and depends on the number of CPUs.
235 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
237 #ifdef CONFIG_LOCKDEP
238 # define RT_HASH_LOCK_SZ 256
241 # define RT_HASH_LOCK_SZ 4096
243 # define RT_HASH_LOCK_SZ 2048
245 # define RT_HASH_LOCK_SZ 1024
247 # define RT_HASH_LOCK_SZ 512
249 # define RT_HASH_LOCK_SZ 256
253 static spinlock_t *rt_hash_locks;
254 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
256 static __init void rt_hash_lock_init(void)
260 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
263 panic("IP: failed to allocate rt_hash_locks\n");
265 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
266 spin_lock_init(&rt_hash_locks[i]);
269 # define rt_hash_lock_addr(slot) NULL
271 static inline void rt_hash_lock_init(void)
276 static struct rt_hash_bucket *rt_hash_table __read_mostly;
277 static unsigned int rt_hash_mask __read_mostly;
278 static unsigned int rt_hash_log __read_mostly;
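/*
 * Illustrative sketch (assumption): with lock striping, each bucket index
 * maps onto one of RT_HASH_LOCK_SZ spinlocks via rt_hash_lock_addr(), so
 * unrelated buckets rarely contend while the lock table stays small.
 */
static inline void illustrate_bucket_update(unsigned int slot)
{
	spin_lock_bh(rt_hash_lock_addr(slot));	/* slot & (RT_HASH_LOCK_SZ - 1) */
	/* ... modify rt_hash_table[slot].chain under the striped lock ... */
	spin_unlock_bh(rt_hash_lock_addr(slot));
}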
280 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
281 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
283 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
284 int genid)
285 {
286 return jhash_3words((__force u32)daddr, (__force u32)saddr,
287 idx, genid)
288 & rt_hash_mask;
289 }
291 static inline int rt_genid(struct net *net)
292 {
293 return atomic_read(&net->ipv4.rt_genid);
294 }
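/*
 * Illustrative sketch (assumption): a lookup derives its bucket from the
 * addresses, the interface index and the current genid, so bumping the
 * genid implicitly invalidates every cached entry.
 */
static inline struct rtable __rcu **illustrate_bucket_head(struct net *net,
							   __be32 daddr,
							   __be32 saddr,
							   int iif)
{
	unsigned int slot = rt_hash(daddr, saddr, iif, rt_genid(net));

	return &rt_hash_table[slot].chain;
}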
296 #ifdef CONFIG_PROC_FS
297 struct rt_cache_iter_state {
298 struct seq_net_private p;
303 static struct rtable *rt_cache_get_first(struct seq_file *seq)
305 struct rt_cache_iter_state *st = seq->private;
306 struct rtable *r = NULL;
308 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
309 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
312 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
314 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
315 r->rt_genid == st->genid)
317 r = rcu_dereference_bh(r->dst.rt_next);
319 rcu_read_unlock_bh();
324 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
327 struct rt_cache_iter_state *st = seq->private;
329 r = rcu_dereference_bh(r->dst.rt_next);
331 rcu_read_unlock_bh();
333 if (--st->bucket < 0)
335 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
337 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
342 static struct rtable *rt_cache_get_next(struct seq_file *seq,
345 struct rt_cache_iter_state *st = seq->private;
346 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
347 if (dev_net(r->dst.dev) != seq_file_net(seq))
349 if (r->rt_genid == st->genid)
355 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
357 struct rtable *r = rt_cache_get_first(seq);
360 while (pos && (r = rt_cache_get_next(seq, r)))
362 return pos ? NULL : r;
365 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
367 struct rt_cache_iter_state *st = seq->private;
369 return rt_cache_get_idx(seq, *pos - 1);
370 st->genid = rt_genid(seq_file_net(seq));
371 return SEQ_START_TOKEN;
374 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
378 if (v == SEQ_START_TOKEN)
379 r = rt_cache_get_first(seq);
381 r = rt_cache_get_next(seq, v);
386 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
388 if (v && v != SEQ_START_TOKEN)
389 rcu_read_unlock_bh();
392 static int rt_cache_seq_show(struct seq_file *seq, void *v)
394 if (v == SEQ_START_TOKEN)
395 seq_printf(seq, "%-127s\n",
396 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
397 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
400 struct rtable *r = v;
403 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
404 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
405 r->dst.dev ? r->dst.dev->name : "*",
406 (__force u32)r->rt_dst,
407 (__force u32)r->rt_gateway,
408 r->rt_flags, atomic_read(&r->dst.__refcnt),
409 r->dst.__use, 0, (__force u32)r->rt_src,
410 dst_metric_advmss(&r->dst) + 40,
411 dst_metric(&r->dst, RTAX_WINDOW), 0,
415 seq_printf(seq, "%*s\n", 127 - len, "");
420 static const struct seq_operations rt_cache_seq_ops = {
421 .start = rt_cache_seq_start,
422 .next = rt_cache_seq_next,
423 .stop = rt_cache_seq_stop,
424 .show = rt_cache_seq_show,
427 static int rt_cache_seq_open(struct inode *inode, struct file *file)
429 return seq_open_net(inode, file, &rt_cache_seq_ops,
430 sizeof(struct rt_cache_iter_state));
433 static const struct file_operations rt_cache_seq_fops = {
434 .owner = THIS_MODULE,
435 .open = rt_cache_seq_open,
438 .release = seq_release_net,
442 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
447 return SEQ_START_TOKEN;
449 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
450 if (!cpu_possible(cpu))
453 return &per_cpu(rt_cache_stat, cpu);
458 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
462 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
463 if (!cpu_possible(cpu))
466 return &per_cpu(rt_cache_stat, cpu);
472 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
477 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
479 struct rt_cache_stat *st = v;
481 if (v == SEQ_START_TOKEN) {
482 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
486 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
487 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
488 dst_entries_get_slow(&ipv4_dst_ops),
511 static const struct seq_operations rt_cpu_seq_ops = {
512 .start = rt_cpu_seq_start,
513 .next = rt_cpu_seq_next,
514 .stop = rt_cpu_seq_stop,
515 .show = rt_cpu_seq_show,
519 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
521 return seq_open(file, &rt_cpu_seq_ops);
524 static const struct file_operations rt_cpu_seq_fops = {
525 .owner = THIS_MODULE,
526 .open = rt_cpu_seq_open,
529 .release = seq_release,
532 #ifdef CONFIG_IP_ROUTE_CLASSID
533 static int rt_acct_proc_show(struct seq_file *m, void *v)
535 struct ip_rt_acct *dst, *src;
538 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
542 for_each_possible_cpu(i) {
543 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
544 for (j = 0; j < 256; j++) {
545 dst[j].o_bytes += src[j].o_bytes;
546 dst[j].o_packets += src[j].o_packets;
547 dst[j].i_bytes += src[j].i_bytes;
548 dst[j].i_packets += src[j].i_packets;
552 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
557 static int rt_acct_proc_open(struct inode *inode, struct file *file)
559 return single_open(file, rt_acct_proc_show, NULL);
562 static const struct file_operations rt_acct_proc_fops = {
563 .owner = THIS_MODULE,
564 .open = rt_acct_proc_open,
567 .release = single_release,
571 static int __net_init ip_rt_do_proc_init(struct net *net)
573 struct proc_dir_entry *pde;
575 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
580 pde = proc_create("rt_cache", S_IRUGO,
581 net->proc_net_stat, &rt_cpu_seq_fops);
585 #ifdef CONFIG_IP_ROUTE_CLASSID
586 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
592 #ifdef CONFIG_IP_ROUTE_CLASSID
594 remove_proc_entry("rt_cache", net->proc_net_stat);
597 remove_proc_entry("rt_cache", net->proc_net);
602 static void __net_exit ip_rt_do_proc_exit(struct net *net)
604 remove_proc_entry("rt_cache", net->proc_net_stat);
605 remove_proc_entry("rt_cache", net->proc_net);
606 #ifdef CONFIG_IP_ROUTE_CLASSID
607 remove_proc_entry("rt_acct", net->proc_net);
611 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
612 .init = ip_rt_do_proc_init,
613 .exit = ip_rt_do_proc_exit,
616 static int __init ip_rt_proc_init(void)
618 return register_pernet_subsys(&ip_rt_proc_ops);
622 static inline int ip_rt_proc_init(void)
626 #endif /* CONFIG_PROC_FS */
628 static inline void rt_free(struct rtable *rt)
630 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
633 static inline void rt_drop(struct rtable *rt)
636 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
639 static inline int rt_fast_clean(struct rtable *rth)
641 /* Kill broadcast/multicast entries very aggressively, if they
642 collide in the hash table with more useful entries */
643 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
644 rt_is_input_route(rth) && rth->dst.rt_next;
647 static inline int rt_valuable(struct rtable *rth)
649 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
653 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
658 if (atomic_read(&rth->dst.__refcnt))
661 age = jiffies - rth->dst.lastuse;
662 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
663 (age <= tmo2 && rt_valuable(rth)))
669 /* Bits of score are:
670 * 31: very valuable
671 * 30: not quite useless
672 * 29..0: usage counter
673 */
674 static inline u32 rt_score(struct rtable *rt)
676 u32 score = jiffies - rt->dst.lastuse;
678 score = ~score & ~(3<<30);
683 if (rt_is_output_route(rt) ||
684 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
690 static inline bool rt_caching(const struct net *net)
692 return net->ipv4.current_rt_cache_rebuild_count <=
693 net->ipv4.sysctl_rt_cache_rebuild_count;
696 static inline bool compare_hash_inputs(const struct rtable *rt1,
697 const struct rtable *rt2)
699 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
700 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
701 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
704 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
706 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
707 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
708 (rt1->rt_mark ^ rt2->rt_mark) |
709 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
710 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
711 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
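/*
 * Illustrative note (assumption): the OR-of-XORs idiom above compares all
 * key fields without branches; the result is zero iff every pair of
 * fields is equal. A minimal two-field equivalent:
 */
static inline int illustrate_keys_equal(u32 a1, u32 a2, u32 b1, u32 b2)
{
	return ((a1 ^ a2) | (b1 ^ b2)) == 0;	/* 1 only if a1 == a2 && b1 == b2 */
}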
714 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
716 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
719 static inline int rt_is_expired(struct rtable *rth)
721 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
725 * Perform a full scan of hash table and free all entries.
726 * Can be called by a softirq or a process.
727 * In the latter case, we want to reschedule if necessary.
729 static void rt_do_flush(struct net *net, int process_context)
732 struct rtable *rth, *next;
734 for (i = 0; i <= rt_hash_mask; i++) {
735 struct rtable __rcu **pprev;
738 if (process_context && need_resched())
740 rth = rcu_access_pointer(rt_hash_table[i].chain);
744 spin_lock_bh(rt_hash_lock_addr(i));
747 pprev = &rt_hash_table[i].chain;
748 rth = rcu_dereference_protected(*pprev,
749 lockdep_is_held(rt_hash_lock_addr(i)));
752 next = rcu_dereference_protected(rth->dst.rt_next,
753 lockdep_is_held(rt_hash_lock_addr(i)));
756 net_eq(dev_net(rth->dst.dev), net)) {
757 rcu_assign_pointer(*pprev, next);
758 rcu_assign_pointer(rth->dst.rt_next, list);
761 pprev = &rth->dst.rt_next;
766 spin_unlock_bh(rt_hash_lock_addr(i));
768 for (; list; list = next) {
769 next = rcu_dereference_protected(list->dst.rt_next, 1);
776 * While freeing expired entries, we compute average chain length
777 * and standard deviation, using fixed-point arithmetic.
778 * This is to get an estimate of rt_chain_length_max:
779 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
780 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
783 #define FRACT_BITS 3
784 #define ONE (1UL << FRACT_BITS)
787 * Given a hash chain and an item in this hash chain,
788 * find if a previous entry has the same hash_inputs
789 * (but differs on tos, mark or oif)
790 * Returns 0 if an alias is found.
791 * Returns ONE if rth has no alias before itself.
793 static int has_noalias(const struct rtable *head, const struct rtable *rth)
795 const struct rtable *aux = head;
798 if (compare_hash_inputs(aux, rth))
800 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
805 static void rt_check_expire(void)
807 static unsigned int rover;
808 unsigned int i = rover, goal;
810 struct rtable __rcu **rthp;
811 unsigned long samples = 0;
812 unsigned long sum = 0, sum2 = 0;
816 delta = jiffies - expires_ljiffies;
817 expires_ljiffies = jiffies;
818 mult = ((u64)delta) << rt_hash_log;
819 if (ip_rt_gc_timeout > 1)
820 do_div(mult, ip_rt_gc_timeout);
821 goal = (unsigned int)mult;
822 if (goal > rt_hash_mask)
823 goal = rt_hash_mask + 1;
824 for (; goal > 0; goal--) {
825 unsigned long tmo = ip_rt_gc_timeout;
826 unsigned long length;
828 i = (i + 1) & rt_hash_mask;
829 rthp = &rt_hash_table[i].chain;
836 if (rcu_dereference_raw(*rthp) == NULL)
839 spin_lock_bh(rt_hash_lock_addr(i));
840 while ((rth = rcu_dereference_protected(*rthp,
841 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
842 prefetch(rth->dst.rt_next);
843 if (rt_is_expired(rth) ||
844 rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
845 *rthp = rth->dst.rt_next;
850 /* We only count entries on a chain with equal
851 * hash inputs once so that entries for
852 * different QOS levels, and other non-hash
853 * input attributes don't unfairly skew the
854 * length computation.
855 */
857 rthp = &rth->dst.rt_next;
858 length += has_noalias(rt_hash_table[i].chain, rth);
860 spin_unlock_bh(rt_hash_lock_addr(i));
862 sum2 += length*length;
865 unsigned long avg = sum / samples;
866 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
867 rt_chain_length_max = max_t(unsigned long,
869 (avg + 4*sd) >> FRACT_BITS);
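/*
 * Illustrative sketch (assumption): the statistics above are plain
 * fixed-point arithmetic with FRACT_BITS fractional bits; the final
 * chain-length bound is "mean plus four standard deviations", shifted
 * back to an integer.
 */
static inline unsigned long illustrate_chain_limit(unsigned long avg,
						   unsigned long sd)
{
	return (avg + 4 * sd) >> FRACT_BITS;	/* fixed point -> integer */
}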
875 * rt_worker_func() is run in process context.
876 * We call rt_check_expire() to scan part of the hash table.
878 static void rt_worker_func(struct work_struct *work)
881 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
885 * Perturbation of rt_genid by a small quantity [1..256].
886 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
887 * many times (2^24) without handing out a recent rt_genid.
888 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
890 static void rt_cache_invalidate(struct net *net)
892 unsigned char shuffle;
894 get_random_bytes(&shuffle, sizeof(shuffle));
895 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
899 * delay < 0 : invalidate cache (fast : entries will be deleted later)
900 * delay >= 0 : invalidate & flush cache (can be long)
902 void rt_cache_flush(struct net *net, int delay)
904 rt_cache_invalidate(net);
906 rt_do_flush(net, !in_softirq());
909 /* Flush previously invalidated entries from the cache */
910 void rt_cache_flush_batch(struct net *net)
912 rt_do_flush(net, !in_softirq());
915 static void rt_emergency_hash_rebuild(struct net *net)
917 net_warn_ratelimited("Route hash chain too long!\n");
918 rt_cache_invalidate(net);
922 Short description of GC goals.
924 We want to build an algorithm which will keep the routing cache
925 at some equilibrium point, where the number of aged-off entries
926 is kept approximately equal to the number of newly generated ones.
928 The current expiration strength is the variable "expire".
929 We try to adjust it dynamically, so that when the network
930 is idle expire is large enough to keep enough warm entries,
931 and when load increases it shrinks to limit the cache size.
934 static int rt_garbage_collect(struct dst_ops *ops)
936 static unsigned long expire = RT_GC_TIMEOUT;
937 static unsigned long last_gc;
939 static int equilibrium;
941 struct rtable __rcu **rthp;
942 unsigned long now = jiffies;
944 int entries = dst_entries_get_fast(&ipv4_dst_ops);
947 * Garbage collection is pretty expensive,
948 * so do not run it too frequently.
951 RT_CACHE_STAT_INC(gc_total);
953 if (now - last_gc < ip_rt_gc_min_interval &&
954 entries < ip_rt_max_size) {
955 RT_CACHE_STAT_INC(gc_ignored);
959 entries = dst_entries_get_slow(&ipv4_dst_ops);
960 /* Calculate the number of entries we want to expire now. */
961 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
963 if (equilibrium < ipv4_dst_ops.gc_thresh)
964 equilibrium = ipv4_dst_ops.gc_thresh;
965 goal = entries - equilibrium;
967 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
968 goal = entries - equilibrium;
971 /* We are in a dangerous area. Try to shrink the cache really
972 * aggressively.
973 */
974 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
975 equilibrium = entries - goal;
978 if (now - last_gc >= ip_rt_gc_min_interval)
989 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
990 unsigned long tmo = expire;
992 k = (k + 1) & rt_hash_mask;
993 rthp = &rt_hash_table[k].chain;
994 spin_lock_bh(rt_hash_lock_addr(k));
995 while ((rth = rcu_dereference_protected(*rthp,
996 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
997 if (!rt_is_expired(rth) &&
998 !rt_may_expire(rth, tmo, expire)) {
1000 rthp = &rth->dst.rt_next;
1003 *rthp = rth->dst.rt_next;
1007 spin_unlock_bh(rt_hash_lock_addr(k));
1016 /* Goal is not achieved. We stop the process if:
1018 - expire is reduced to zero; otherwise, expire is halved.
1019 - the table is not full.
1020 - we are called from interrupt.
1021 - the jiffies check is just a fallback/debug loop breaker.
1022 We will not spin here for a long time in any case.
1025 RT_CACHE_STAT_INC(gc_goal_miss);
1032 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1034 } while (!in_softirq() && time_before_eq(jiffies, now));
1036 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1038 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1040 net_warn_ratelimited("dst cache overflow\n");
1041 RT_CACHE_STAT_INC(gc_dst_overflow);
1045 expire += ip_rt_gc_min_interval;
1046 if (expire > ip_rt_gc_timeout ||
1047 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1048 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1049 expire = ip_rt_gc_timeout;
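/*
 * Illustrative sketch (assumption): the GC above adapts "expire" like a
 * small control loop - halve it whenever a pass misses its goal, then let
 * it creep back toward ip_rt_gc_timeout once the pressure is gone.
 */
static inline unsigned long illustrate_expire_step(unsigned long expire,
						   bool goal_missed)
{
	if (goal_missed)
		return expire >> 1;		/* be more aggressive next pass */
	expire += ip_rt_gc_min_interval;	/* relax slowly */
	return min_t(unsigned long, expire, ip_rt_gc_timeout);
}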
1054 * Returns the number of entries in a hash chain that have different hash_inputs.
1056 static int slow_chain_length(const struct rtable *head)
1059 const struct rtable *rth = head;
1062 length += has_noalias(head, rth);
1063 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1065 return length >> FRACT_BITS;
1068 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1069 struct sk_buff *skb,
1072 struct net_device *dev = dst->dev;
1073 const __be32 *pkey = daddr;
1074 const struct rtable *rt;
1075 struct neighbour *n;
1077 rt = (const struct rtable *) dst;
1079 pkey = (const __be32 *) &rt->rt_gateway;
1081 pkey = &ip_hdr(skb)->daddr;
1083 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1086 return neigh_create(&arp_tbl, pkey, dev);
1089 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1090 struct sk_buff *skb, int ifindex)
1092 struct rtable *rth, *cand;
1093 struct rtable __rcu **rthp, **candp;
1100 min_score = ~(u32)0;
1105 if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1107 * If we're not caching, just tell the caller we
1108 * were successful and don't touch the route. The
1109 * caller holds the sole reference to the cache entry, and
1110 * it will be released when the caller is done with it.
1111 * If we drop it here, the callers have no way to resolve routes
1112 * when we're not caching. Instead, just point *rp at rt, so
1113 * the caller gets a single use out of the route
1114 * Note that we do rt_free on this new route entry, so that
1115 * once its refcount hits zero, we are still able to reap it
1117 * Note: To avoid expensive rcu stuff for this uncached dst,
1118 * we set DST_NOCACHE so that dst_release() can free dst without
1119 * waiting for a grace period.
1122 rt->dst.flags |= DST_NOCACHE;
1126 rthp = &rt_hash_table[hash].chain;
1128 spin_lock_bh(rt_hash_lock_addr(hash));
1129 while ((rth = rcu_dereference_protected(*rthp,
1130 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1131 if (rt_is_expired(rth)) {
1132 *rthp = rth->dst.rt_next;
1136 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1138 *rthp = rth->dst.rt_next;
1140 * Since lookup is lockfree, the deletion
1141 * must be visible to another weakly ordered CPU before
1142 * the insertion at the start of the hash chain.
1144 rcu_assign_pointer(rth->dst.rt_next,
1145 rt_hash_table[hash].chain);
1147 * Since lookup is lockfree, the update writes
1148 * must be ordered for consistency on SMP.
1150 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1152 dst_use(&rth->dst, now);
1153 spin_unlock_bh(rt_hash_lock_addr(hash));
1157 skb_dst_set(skb, &rth->dst);
1161 if (!atomic_read(&rth->dst.__refcnt)) {
1162 u32 score = rt_score(rth);
1164 if (score <= min_score) {
1173 rthp = &rth->dst.rt_next;
1177 /* ip_rt_gc_elasticity used to be the average chain
1178 * length; when exceeded, GC becomes really aggressive.
1180 * The second limit is less certain. At the moment it allows
1181 * only 2 entries per bucket. We will see.
1183 if (chain_length > ip_rt_gc_elasticity) {
1184 *candp = cand->dst.rt_next;
1188 if (chain_length > rt_chain_length_max &&
1189 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190 struct net *net = dev_net(rt->dst.dev);
1191 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192 if (!rt_caching(net)) {
1193 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1194 rt->dst.dev->name, num);
1196 rt_emergency_hash_rebuild(net);
1197 spin_unlock_bh(rt_hash_lock_addr(hash));
1199 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1200 ifindex, rt_genid(net));
1205 rt->dst.rt_next = rt_hash_table[hash].chain;
1208 * Since lookup is lockfree, we must make sure
1209 * previous writes to rt are committed to memory
1210 * before making rt visible to other CPUS.
1212 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1214 spin_unlock_bh(rt_hash_lock_addr(hash));
1218 skb_dst_set(skb, &rt->dst);
1223 * Peer allocation may fail only in serious out-of-memory conditions. However
1224 * we can still generate some output.
1225 * Random ID selection looks a bit dangerous because we have no chance to
1226 * select an ID that is unique within a reasonable period of time.
1227 * But a broken packet identifier may be better than no packet at all.
1229 static void ip_select_fb_ident(struct iphdr *iph)
1231 static DEFINE_SPINLOCK(ip_fb_id_lock);
1232 static u32 ip_fallback_id;
1235 spin_lock_bh(&ip_fb_id_lock);
1236 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1237 iph->id = htons(salt & 0xFFFF);
1238 ip_fallback_id = salt;
1239 spin_unlock_bh(&ip_fb_id_lock);
1242 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1244 struct net *net = dev_net(dst->dev);
1245 struct inet_peer *peer;
1247 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
1249 iph->id = htons(inet_getid(peer, more));
1254 ip_select_fb_ident(iph);
1256 EXPORT_SYMBOL(__ip_select_ident);
1258 static void rt_del(unsigned int hash, struct rtable *rt)
1260 struct rtable __rcu **rthp;
1263 rthp = &rt_hash_table[hash].chain;
1264 spin_lock_bh(rt_hash_lock_addr(hash));
1266 while ((aux = rcu_dereference_protected(*rthp,
1267 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1268 if (aux == rt || rt_is_expired(aux)) {
1269 *rthp = aux->dst.rt_next;
1273 rthp = &aux->dst.rt_next;
1275 spin_unlock_bh(rt_hash_lock_addr(hash));
1278 static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
1279 const struct iphdr *iph,
1281 u8 prot, u32 mark, int flow_flags)
1284 const struct inet_sock *inet = inet_sk(sk);
1286 oif = sk->sk_bound_dev_if;
1288 tos = RT_CONN_FLAGS(sk);
1289 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
1291 flowi4_init_output(fl4, oif, mark, tos,
1292 RT_SCOPE_UNIVERSE, prot,
1294 iph->daddr, iph->saddr, 0, 0);
1297 static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
1299 const struct iphdr *iph = ip_hdr(skb);
1300 int oif = skb->dev->ifindex;
1301 u8 tos = RT_TOS(iph->tos);
1302 u8 prot = iph->protocol;
1303 u32 mark = skb->mark;
1305 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
1308 static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
1310 const struct inet_sock *inet = inet_sk(sk);
1311 struct ip_options_rcu *inet_opt;
1312 __be32 daddr = inet->inet_daddr;
1315 inet_opt = rcu_dereference(inet->inet_opt);
1316 if (inet_opt && inet_opt->opt.srr)
1317 daddr = inet_opt->opt.faddr;
1318 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
1319 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
1320 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
1321 inet_sk_flowi_flags(sk),
1322 daddr, inet->inet_saddr, 0, 0);
1326 static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
1327 struct sk_buff *skb)
1330 build_skb_flow_key(fl4, skb, sk);
1332 build_sk_flow_key(fl4, sk);
1335 static DEFINE_SPINLOCK(fnhe_lock);
1337 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
1339 struct fib_nh_exception *fnhe, *oldest;
1341 oldest = rcu_dereference(hash->chain);
1342 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
1343 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1344 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
1350 static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
1352 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1353 struct fib_nh_exception *fnhe;
1358 hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
1364 hval = (__force u32) daddr;
1365 hval ^= (hval >> 11) ^ (hval >> 22);
1369 for (fnhe = rcu_dereference(hash->chain); fnhe;
1370 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1371 if (fnhe->fnhe_daddr == daddr)
1376 if (depth > FNHE_RECLAIM_DEPTH) {
1377 fnhe = fnhe_oldest(hash + hval, daddr);
1380 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
1384 fnhe->fnhe_next = hash->chain;
1385 rcu_assign_pointer(hash->chain, fnhe);
1388 fnhe->fnhe_daddr = daddr;
1390 fnhe->fnhe_stamp = jiffies;
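/*
 * Illustrative sketch (assumption): the exception table hashes the
 * destination address down to FNHE_HASH_SIZE buckets by folding the
 * address onto itself, exactly as open-coded above.
 */
static inline u32 illustrate_fnhe_hash(__be32 daddr)
{
	u32 hval = (__force u32) daddr;

	hval ^= (hval >> 11) ^ (hval >> 22);
	return hval & (FNHE_HASH_SIZE - 1);
}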
1394 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
1396 __be32 new_gw = icmp_hdr(skb)->un.gateway;
1397 __be32 old_gw = ip_hdr(skb)->saddr;
1398 struct net_device *dev = skb->dev;
1399 struct in_device *in_dev;
1400 struct fib_result res;
1401 struct neighbour *n;
1404 switch (icmp_hdr(skb)->code & 7) {
1405 case ICMP_REDIR_NET:
1406 case ICMP_REDIR_NETTOS:
1407 case ICMP_REDIR_HOST:
1408 case ICMP_REDIR_HOSTTOS:
1415 if (rt->rt_gateway != old_gw)
1418 in_dev = __in_dev_get_rcu(dev);
1423 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1424 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1425 ipv4_is_zeronet(new_gw))
1426 goto reject_redirect;
1428 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1429 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1430 goto reject_redirect;
1431 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1432 goto reject_redirect;
1434 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1435 goto reject_redirect;
1438 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
1440 if (!(n->nud_state & NUD_VALID)) {
1441 neigh_event_send(n, NULL);
1443 if (fib_lookup(net, fl4, &res) == 0) {
1444 struct fib_nh *nh = &FIB_RES_NH(res);
1445 struct fib_nh_exception *fnhe;
1447 spin_lock_bh(&fnhe_lock);
1448 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1450 fnhe->fnhe_gw = new_gw;
1451 spin_unlock_bh(&fnhe_lock);
1453 rt->rt_gateway = new_gw;
1454 rt->rt_flags |= RTCF_REDIRECTED;
1455 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1462 #ifdef CONFIG_IP_ROUTE_VERBOSE
1463 if (IN_DEV_LOG_MARTIANS(in_dev)) {
1464 const struct iphdr *iph = (const struct iphdr *) skb->data;
1465 __be32 daddr = iph->daddr;
1466 __be32 saddr = iph->saddr;
1468 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1469 " Advised path = %pI4 -> %pI4\n",
1470 &old_gw, dev->name, &new_gw,
1477 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1482 rt = (struct rtable *) dst;
1484 ip_rt_build_flow_key(&fl4, sk, skb);
1485 __ip_do_redirect(rt, skb, &fl4);
1488 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1490 struct rtable *rt = (struct rtable *)dst;
1491 struct dst_entry *ret = dst;
1494 if (dst->obsolete > 0) {
1497 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1499 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1501 rt_genid(dev_net(dst->dev)));
1511 * 1. The first ip_rt_redirect_number redirects are sent
1512 * with exponential backoff, then we stop sending them at all,
1513 * assuming that the host ignores our redirects.
1514 * 2. If we did not see packets requiring redirects
1515 * during ip_rt_redirect_silence, we assume that the host
1516 * forgot the redirected route and start to send redirects again.
1518 * This algorithm is much cheaper and more intelligent than dumb load limiting
1521 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1522 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1525 void ip_rt_send_redirect(struct sk_buff *skb)
1527 struct rtable *rt = skb_rtable(skb);
1528 struct in_device *in_dev;
1529 struct inet_peer *peer;
1534 in_dev = __in_dev_get_rcu(rt->dst.dev);
1535 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1539 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1542 net = dev_net(rt->dst.dev);
1543 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1545 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1549 /* No redirected packets during ip_rt_redirect_silence;
1550 * reset the algorithm.
1552 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1553 peer->rate_tokens = 0;
1555 /* Too many ignored redirects; do not send anything.
1556 * Set peer->rate_last to the last seen redirected packet.
1558 if (peer->rate_tokens >= ip_rt_redirect_number) {
1559 peer->rate_last = jiffies;
1563 /* Check for load limit; set rate_last to the latest sent
1566 if (peer->rate_tokens == 0 ||
1569 (ip_rt_redirect_load << peer->rate_tokens)))) {
1570 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1571 peer->rate_last = jiffies;
1572 ++peer->rate_tokens;
1573 #ifdef CONFIG_IP_ROUTE_VERBOSE
1575 peer->rate_tokens == ip_rt_redirect_number)
1576 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1577 &ip_hdr(skb)->saddr, rt->rt_iif,
1578 &rt->rt_dst, &rt->rt_gateway);
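/*
 * Illustrative sketch (assumption, simplified): redirect number k is sent
 * no earlier than rate_last + (ip_rt_redirect_load << k), so the gaps grow
 * exponentially, and after ip_rt_redirect_number tokens we stay silent
 * until ip_rt_redirect_silence of quiet time resets the counter.
 */
static inline bool illustrate_may_send_redirect(unsigned long rate_last,
						int rate_tokens)
{
	if (rate_tokens >= ip_rt_redirect_number)
		return false;	/* the host seems to ignore us; give up */
	return time_after(jiffies,
			  rate_last + (ip_rt_redirect_load << rate_tokens));
}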
1585 static int ip_error(struct sk_buff *skb)
1587 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
1588 struct rtable *rt = skb_rtable(skb);
1589 struct inet_peer *peer;
1595 net = dev_net(rt->dst.dev);
1596 if (!IN_DEV_FORWARD(in_dev)) {
1597 switch (rt->dst.error) {
1599 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
1603 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1609 switch (rt->dst.error) {
1614 code = ICMP_HOST_UNREACH;
1617 code = ICMP_NET_UNREACH;
1618 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
1621 code = ICMP_PKT_FILTERED;
1625 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
1630 peer->rate_tokens += now - peer->rate_last;
1631 if (peer->rate_tokens > ip_rt_error_burst)
1632 peer->rate_tokens = ip_rt_error_burst;
1633 peer->rate_last = now;
1634 if (peer->rate_tokens >= ip_rt_error_cost)
1635 peer->rate_tokens -= ip_rt_error_cost;
1641 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1643 out: kfree_skb(skb);
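/*
 * Illustrative sketch (assumption): ICMP error emission above is a token
 * bucket measured in jiffies - tokens accrue with elapsed time, are capped
 * at ip_rt_error_burst (five seconds' worth), and each error costs
 * ip_rt_error_cost (one second's worth), i.e. roughly one error per second
 * once the burst is spent.
 */
static inline bool illustrate_error_allowed(unsigned long *tokens,
					    unsigned long elapsed)
{
	*tokens = min_t(unsigned long, *tokens + elapsed, ip_rt_error_burst);
	if (*tokens < ip_rt_error_cost)
		return false;			/* rate limited */
	*tokens -= ip_rt_error_cost;
	return true;				/* OK to send the ICMP error */
}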
1647 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1649 struct fib_result res;
1651 if (mtu < ip_rt_min_pmtu)
1652 mtu = ip_rt_min_pmtu;
1654 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
1655 struct fib_nh *nh = &FIB_RES_NH(res);
1656 struct fib_nh_exception *fnhe;
1658 spin_lock_bh(&fnhe_lock);
1659 fnhe = find_or_create_fnhe(nh, fl4->daddr);
1661 fnhe->fnhe_pmtu = mtu;
1662 fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
1664 spin_unlock_bh(&fnhe_lock);
1667 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
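/*
 * Illustrative note (assumption): ip_rt_min_pmtu above is 512 + 20 + 20,
 * i.e. 512 bytes of payload plus a 20-byte IP header and a 20-byte TCP
 * header, so a learned path MTU is never clamped below 552 bytes.
 */
static inline u32 illustrate_clamp_pmtu(u32 mtu)
{
	return mtu < ip_rt_min_pmtu ? (u32)ip_rt_min_pmtu : mtu;
}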
1670 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1671 struct sk_buff *skb, u32 mtu)
1673 struct rtable *rt = (struct rtable *) dst;
1676 ip_rt_build_flow_key(&fl4, sk, skb);
1677 __ip_rt_update_pmtu(rt, &fl4, mtu);
1680 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1681 int oif, u32 mark, u8 protocol, int flow_flags)
1683 const struct iphdr *iph = (const struct iphdr *) skb->data;
1687 __build_flow_key(&fl4, NULL, iph, oif,
1688 RT_TOS(iph->tos), protocol, mark, flow_flags);
1689 rt = __ip_route_output_key(net, &fl4);
1691 __ip_rt_update_pmtu(rt, &fl4, mtu);
1695 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1697 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1699 const struct iphdr *iph = (const struct iphdr *) skb->data;
1703 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1704 rt = __ip_route_output_key(sock_net(sk), &fl4);
1706 __ip_rt_update_pmtu(rt, &fl4, mtu);
1710 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1712 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1713 int oif, u32 mark, u8 protocol, int flow_flags)
1715 const struct iphdr *iph = (const struct iphdr *) skb->data;
1719 __build_flow_key(&fl4, NULL, iph, oif,
1720 RT_TOS(iph->tos), protocol, mark, flow_flags);
1721 rt = __ip_route_output_key(net, &fl4);
1723 __ip_do_redirect(rt, skb, &fl4);
1727 EXPORT_SYMBOL_GPL(ipv4_redirect);
1729 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1731 const struct iphdr *iph = (const struct iphdr *) skb->data;
1735 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1736 rt = __ip_route_output_key(sock_net(sk), &fl4);
1738 __ip_do_redirect(rt, skb, &fl4);
1742 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1744 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1746 struct rtable *rt = (struct rtable *) dst;
1748 if (rt_is_expired(rt))
1753 static void ipv4_dst_destroy(struct dst_entry *dst)
1755 struct rtable *rt = (struct rtable *) dst;
1758 fib_info_put(rt->fi);
1764 static void ipv4_link_failure(struct sk_buff *skb)
1768 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1770 rt = skb_rtable(skb);
1772 dst_set_expires(&rt->dst, 0);
1775 static int ip_rt_bug(struct sk_buff *skb)
1777 pr_debug("%s: %pI4 -> %pI4, %s\n",
1778 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1779 skb->dev ? skb->dev->name : "?");
1786 We do not cache source address of outgoing interface,
1787 because it is used only by IP RR, TS and SRR options,
1788 so that it out of fast path.
1790 BTW remember: "addr" is allowed to be not aligned
1794 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1798 if (rt_is_output_route(rt))
1799 src = ip_hdr(skb)->saddr;
1801 struct fib_result res;
1807 memset(&fl4, 0, sizeof(fl4));
1808 fl4.daddr = iph->daddr;
1809 fl4.saddr = iph->saddr;
1810 fl4.flowi4_tos = RT_TOS(iph->tos);
1811 fl4.flowi4_oif = rt->dst.dev->ifindex;
1812 fl4.flowi4_iif = skb->dev->ifindex;
1813 fl4.flowi4_mark = skb->mark;
1816 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1817 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1819 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1823 memcpy(addr, &src, 4);
1826 #ifdef CONFIG_IP_ROUTE_CLASSID
1827 static void set_class_tag(struct rtable *rt, u32 tag)
1829 if (!(rt->dst.tclassid & 0xFFFF))
1830 rt->dst.tclassid |= tag & 0xFFFF;
1831 if (!(rt->dst.tclassid & 0xFFFF0000))
1832 rt->dst.tclassid |= tag & 0xFFFF0000;
1836 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1838 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1841 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1843 if (advmss > 65535 - 40)
1844 advmss = 65535 - 40;
1849 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1851 const struct rtable *rt = (const struct rtable *) dst;
1852 unsigned int mtu = rt->rt_pmtu;
1854 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1858 mtu = dst_metric_raw(dst, RTAX_MTU);
1860 if (mtu && rt_is_output_route(rt))
1863 mtu = dst->dev->mtu;
1865 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1867 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1871 if (mtu > IP_MAX_MTU)
1877 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1878 struct fib_info *fi)
1880 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1882 atomic_inc(&fi->fib_clntref);
1884 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1887 static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1889 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1890 struct fib_nh_exception *fnhe;
1893 hval = (__force u32) daddr;
1894 hval ^= (hval >> 11) ^ (hval >> 22);
1896 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1897 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1898 if (fnhe->fnhe_daddr == daddr) {
1899 if (fnhe->fnhe_pmtu) {
1900 unsigned long expires = fnhe->fnhe_expires;
1901 unsigned long diff = expires - jiffies;
1903 if (time_before(jiffies, expires)) {
1904 rt->rt_pmtu = fnhe->fnhe_pmtu;
1905 dst_set_expires(&rt->dst, diff);
1909 rt->rt_gateway = fnhe->fnhe_gw;
1910 fnhe->fnhe_stamp = jiffies;
1916 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1917 const struct fib_result *res,
1918 struct fib_info *fi, u16 type, u32 itag)
1921 struct fib_nh *nh = &FIB_RES_NH(*res);
1923 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1924 rt->rt_gateway = nh->nh_gw;
1925 if (unlikely(nh->nh_exceptions))
1926 rt_bind_exception(rt, nh, fl4->daddr);
1927 rt_init_metrics(rt, fl4, fi);
1928 #ifdef CONFIG_IP_ROUTE_CLASSID
1929 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1933 #ifdef CONFIG_IP_ROUTE_CLASSID
1934 #ifdef CONFIG_IP_MULTIPLE_TABLES
1935 set_class_tag(rt, res->tclassid);
1937 set_class_tag(rt, itag);
1941 static struct rtable *rt_dst_alloc(struct net_device *dev,
1942 bool nopolicy, bool noxfrm)
1944 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1946 (nopolicy ? DST_NOPOLICY : 0) |
1947 (noxfrm ? DST_NOXFRM : 0));
1950 /* called in rcu_read_lock() section */
1951 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1952 u8 tos, struct net_device *dev, int our)
1956 struct in_device *in_dev = __in_dev_get_rcu(dev);
1960 /* Primary sanity checks. */
1965 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1966 skb->protocol != htons(ETH_P_IP))
1969 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1970 if (ipv4_is_loopback(saddr))
1973 if (ipv4_is_zeronet(saddr)) {
1974 if (!ipv4_is_local_multicast(daddr))
1977 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1982 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1983 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1987 #ifdef CONFIG_IP_ROUTE_CLASSID
1988 rth->dst.tclassid = itag;
1990 rth->dst.output = ip_rt_bug;
1992 rth->rt_key_dst = daddr;
1993 rth->rt_key_src = saddr;
1994 rth->rt_genid = rt_genid(dev_net(dev));
1995 rth->rt_flags = RTCF_MULTICAST;
1996 rth->rt_type = RTN_MULTICAST;
1997 rth->rt_key_tos = tos;
1998 rth->rt_dst = daddr;
1999 rth->rt_src = saddr;
2000 rth->rt_route_iif = dev->ifindex;
2001 rth->rt_iif = dev->ifindex;
2003 rth->rt_mark = skb->mark;
2005 rth->rt_gateway = daddr;
2008 rth->dst.input= ip_local_deliver;
2009 rth->rt_flags |= RTCF_LOCAL;
2012 #ifdef CONFIG_IP_MROUTE
2013 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2014 rth->dst.input = ip_mr_input;
2016 RT_CACHE_STAT_INC(in_slow_mc);
2018 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2019 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2020 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2031 static void ip_handle_martian_source(struct net_device *dev,
2032 struct in_device *in_dev,
2033 struct sk_buff *skb,
2037 RT_CACHE_STAT_INC(in_martian_src);
2038 #ifdef CONFIG_IP_ROUTE_VERBOSE
2039 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2041 * RFC1812 recommendation: if the source is martian,
2042 * the only hint is the MAC header.
2044 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2045 &daddr, &saddr, dev->name);
2046 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2047 print_hex_dump(KERN_WARNING, "ll header: ",
2048 DUMP_PREFIX_OFFSET, 16, 1,
2049 skb_mac_header(skb),
2050 dev->hard_header_len, true);
2056 /* called in rcu_read_lock() section */
2057 static int __mkroute_input(struct sk_buff *skb,
2058 const struct fib_result *res,
2059 struct in_device *in_dev,
2060 __be32 daddr, __be32 saddr, u32 tos,
2061 struct rtable **result)
2065 struct in_device *out_dev;
2066 unsigned int flags = 0;
2069 /* get a working reference to the output device */
2070 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2071 if (out_dev == NULL) {
2072 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2077 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2078 in_dev->dev, in_dev, &itag);
2080 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2087 flags |= RTCF_DIRECTSRC;
2089 if (out_dev == in_dev && err &&
2090 (IN_DEV_SHARED_MEDIA(out_dev) ||
2091 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2092 flags |= RTCF_DOREDIRECT;
2094 if (skb->protocol != htons(ETH_P_IP)) {
2095 /* Not IP (i.e. ARP). Do not create route, if it is
2096 * invalid for proxy arp. DNAT routes are always valid.
2098 * The proxy ARP feature has been extended to allow ARP
2099 * replies back to the same interface, to support
2100 * Private VLAN switch technologies. See arp.c.
2102 if (out_dev == in_dev &&
2103 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2109 rth = rt_dst_alloc(out_dev->dev,
2110 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2111 IN_DEV_CONF_GET(out_dev, NOXFRM));
2117 rth->rt_key_dst = daddr;
2118 rth->rt_key_src = saddr;
2119 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2120 rth->rt_flags = flags;
2121 rth->rt_type = res->type;
2122 rth->rt_key_tos = tos;
2123 rth->rt_dst = daddr;
2124 rth->rt_src = saddr;
2125 rth->rt_route_iif = in_dev->dev->ifindex;
2126 rth->rt_iif = in_dev->dev->ifindex;
2128 rth->rt_mark = skb->mark;
2130 rth->rt_gateway = daddr;
2133 rth->dst.input = ip_forward;
2134 rth->dst.output = ip_output;
2136 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2144 static int ip_mkroute_input(struct sk_buff *skb,
2145 struct fib_result *res,
2146 const struct flowi4 *fl4,
2147 struct in_device *in_dev,
2148 __be32 daddr, __be32 saddr, u32 tos)
2150 struct rtable *rth = NULL;
2154 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2155 if (res->fi && res->fi->fib_nhs > 1)
2156 fib_select_multipath(res);
2159 /* create a routing cache entry */
2160 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2164 /* put it into the cache */
2165 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2166 rt_genid(dev_net(rth->dst.dev)));
2167 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2169 return PTR_ERR(rth);
2174 * NOTE. We drop all the packets that have local source
2175 * addresses, because every properly looped-back packet
2176 * must have a correct destination already attached by the output routine.
2178 * This approach solves two big problems:
2179 * 1. Non-simplex devices are handled properly.
2180 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2181 * called with rcu_read_lock()
2184 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2185 u8 tos, struct net_device *dev)
2187 struct fib_result res;
2188 struct in_device *in_dev = __in_dev_get_rcu(dev);
2190 unsigned int flags = 0;
2195 struct net *net = dev_net(dev);
2197 /* IP on this device is disabled. */
2202 /* Check for the weirdest martians, which cannot be detected
2203 by fib_lookup.
2204 */
2206 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2207 goto martian_source;
2209 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2212 /* Accept zero addresses only to limited broadcast;
2213 * I do not even know whether to fix it or not. Waiting for complaints :-)
2215 if (ipv4_is_zeronet(saddr))
2216 goto martian_source;
2218 if (ipv4_is_zeronet(daddr))
2219 goto martian_destination;
2221 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
2222 if (ipv4_is_loopback(daddr))
2223 goto martian_destination;
2225 if (ipv4_is_loopback(saddr))
2226 goto martian_source;
2230 * Now we are ready to route packet.
2233 fl4.flowi4_iif = dev->ifindex;
2234 fl4.flowi4_mark = skb->mark;
2235 fl4.flowi4_tos = tos;
2236 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2239 err = fib_lookup(net, &fl4, &res);
2243 RT_CACHE_STAT_INC(in_slow_tot);
2245 if (res.type == RTN_BROADCAST)
2248 if (res.type == RTN_LOCAL) {
2249 err = fib_validate_source(skb, saddr, daddr, tos,
2250 net->loopback_dev->ifindex,
2251 dev, in_dev, &itag);
2253 goto martian_source_keep_err;
2255 flags |= RTCF_DIRECTSRC;
2259 if (!IN_DEV_FORWARD(in_dev))
2261 if (res.type != RTN_UNICAST)
2262 goto martian_destination;
2264 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2268 if (skb->protocol != htons(ETH_P_IP))
2271 if (!ipv4_is_zeronet(saddr)) {
2272 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2275 goto martian_source_keep_err;
2277 flags |= RTCF_DIRECTSRC;
2279 flags |= RTCF_BROADCAST;
2280 res.type = RTN_BROADCAST;
2281 RT_CACHE_STAT_INC(in_brd);
2284 rth = rt_dst_alloc(net->loopback_dev,
2285 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2289 rth->dst.input= ip_local_deliver;
2290 rth->dst.output= ip_rt_bug;
2291 #ifdef CONFIG_IP_ROUTE_CLASSID
2292 rth->dst.tclassid = itag;
2295 rth->rt_key_dst = daddr;
2296 rth->rt_key_src = saddr;
2297 rth->rt_genid = rt_genid(net);
2298 rth->rt_flags = flags|RTCF_LOCAL;
2299 rth->rt_type = res.type;
2300 rth->rt_key_tos = tos;
2301 rth->rt_dst = daddr;
2302 rth->rt_src = saddr;
2303 rth->rt_route_iif = dev->ifindex;
2304 rth->rt_iif = dev->ifindex;
2306 rth->rt_mark = skb->mark;
2308 rth->rt_gateway = daddr;
2310 if (res.type == RTN_UNREACHABLE) {
2311 rth->dst.input= ip_error;
2312 rth->dst.error= -err;
2313 rth->rt_flags &= ~RTCF_LOCAL;
2315 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2316 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2323 RT_CACHE_STAT_INC(in_no_route);
2324 res.type = RTN_UNREACHABLE;
2330 * Do not cache martian addresses: they should be logged (RFC1812)
2332 martian_destination:
2333 RT_CACHE_STAT_INC(in_martian_dst);
2334 #ifdef CONFIG_IP_ROUTE_VERBOSE
2335 if (IN_DEV_LOG_MARTIANS(in_dev))
2336 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2337 &daddr, &saddr, dev->name);
2350 martian_source_keep_err:
2351 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2355 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2356 u8 tos, struct net_device *dev, bool noref)
2360 int iif = dev->ifindex;
2368 if (!rt_caching(net))
2371 tos &= IPTOS_RT_MASK;
2372 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2374 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2375 rth = rcu_dereference(rth->dst.rt_next)) {
2376 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2377 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2378 (rth->rt_route_iif ^ iif) |
2379 (rth->rt_key_tos ^ tos)) == 0 &&
2380 rth->rt_mark == skb->mark &&
2381 net_eq(dev_net(rth->dst.dev), net) &&
2382 !rt_is_expired(rth)) {
2384 dst_use_noref(&rth->dst, jiffies);
2385 skb_dst_set_noref(skb, &rth->dst);
2387 dst_use(&rth->dst, jiffies);
2388 skb_dst_set(skb, &rth->dst);
2390 RT_CACHE_STAT_INC(in_hit);
2394 RT_CACHE_STAT_INC(in_hlist_search);
2398 /* Multicast recognition logic is moved from route cache to here.
2399 The problem was that too many Ethernet cards have broken/missing
2400 hardware multicast filters :-( As a result, a host on a multicast
2401 network acquires a lot of useless route cache entries, e.g. for
2402 SDR messages from all over the world. Now we try to get rid of them.
2403 Really, provided the software IP multicast filter is organized
2404 reasonably (at least, hashed), it does not result in a slowdown
2405 compared with route cache reject entries.
2406 Note that multicast routers are not affected, because a
2407 route cache entry is created eventually.
2409 if (ipv4_is_multicast(daddr)) {
2410 struct in_device *in_dev = __in_dev_get_rcu(dev);
2413 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2414 ip_hdr(skb)->protocol);
2416 #ifdef CONFIG_IP_MROUTE
2418 (!ipv4_is_local_multicast(daddr) &&
2419 IN_DEV_MFORWARD(in_dev))
2422 int res = ip_route_input_mc(skb, daddr, saddr,
2431 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2435 EXPORT_SYMBOL(ip_route_input_common);
2437 /* called with rcu_read_lock() */
2438 static struct rtable *__mkroute_output(const struct fib_result *res,
2439 const struct flowi4 *fl4,
2440 __be32 orig_daddr, __be32 orig_saddr,
2441 int orig_oif, __u8 orig_rtos,
2442 struct net_device *dev_out,
2445 struct fib_info *fi = res->fi;
2446 struct in_device *in_dev;
2447 u16 type = res->type;
2450 in_dev = __in_dev_get_rcu(dev_out);
2452 return ERR_PTR(-EINVAL);
2454 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2455 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2456 return ERR_PTR(-EINVAL);
2458 if (ipv4_is_lbcast(fl4->daddr))
2459 type = RTN_BROADCAST;
2460 else if (ipv4_is_multicast(fl4->daddr))
2461 type = RTN_MULTICAST;
2462 else if (ipv4_is_zeronet(fl4->daddr))
2463 return ERR_PTR(-EINVAL);
2465 if (dev_out->flags & IFF_LOOPBACK)
2466 flags |= RTCF_LOCAL;
2468 if (type == RTN_BROADCAST) {
2469 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2471 } else if (type == RTN_MULTICAST) {
2472 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2473 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2475 flags &= ~RTCF_LOCAL;
2476 /* If a multicast route does not exist, use
2477 * the default one, but do not use a gateway in this case.
2480 if (fi && res->prefixlen < 4)
2484 rth = rt_dst_alloc(dev_out,
2485 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2486 IN_DEV_CONF_GET(in_dev, NOXFRM));
2488 return ERR_PTR(-ENOBUFS);
2490 rth->dst.output = ip_output;
2492 rth->rt_key_dst = orig_daddr;
2493 rth->rt_key_src = orig_saddr;
2494 rth->rt_genid = rt_genid(dev_net(dev_out));
2495 rth->rt_flags = flags;
2496 rth->rt_type = type;
2497 rth->rt_key_tos = orig_rtos;
2498 rth->rt_dst = fl4->daddr;
2499 rth->rt_src = fl4->saddr;
2500 rth->rt_route_iif = 0;
2501 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2502 rth->rt_oif = orig_oif;
2503 rth->rt_mark = fl4->flowi4_mark;
2505 rth->rt_gateway = fl4->daddr;
2508 RT_CACHE_STAT_INC(out_slow_tot);
2510 if (flags & RTCF_LOCAL)
2511 rth->dst.input = ip_local_deliver;
2512 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2513 if (flags & RTCF_LOCAL &&
2514 !(dev_out->flags & IFF_LOOPBACK)) {
2515 rth->dst.output = ip_mc_output;
2516 RT_CACHE_STAT_INC(out_slow_mc);
2518 #ifdef CONFIG_IP_MROUTE
2519 if (type == RTN_MULTICAST) {
2520 if (IN_DEV_MFORWARD(in_dev) &&
2521 !ipv4_is_local_multicast(fl4->daddr)) {
2522 rth->dst.input = ip_mr_input;
2523 rth->dst.output = ip_mc_output;
2529 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2531 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2532 rth->dst.flags |= DST_NOCACHE;
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
	res.table	= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (the
			   routing cache cannot know that ttl is zero, so the
			   packet will not leave this host and the route is
			   valid). Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface even
			   if it has NO routes and NO assigned addresses.
			   When oif is specified, the routing tables are
			   looked up with only one purpose: to catch whether
			   the destination is gatewayed, rather than direct.
			   Moreover, if MSG_DONTROUTE is set, we send the
			   packet, ignoring both the routing tables and the
			   ifaddr state. --ANK

			   We could do this even if oif is unknown,
			   as IPv6 likely does, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
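
/*
 * Editor's example (a sketch, not part of the original source): the
 * "special hack" above means a lookup like the following resolves a
 * multicast destination through the interface owning saddr, even with
 * flowi4_oif left at zero; "local_addr" is a hypothetical local variable:
 *
 *	struct flowi4 fl4 = {
 *		.daddr = htonl(0xe0000001),	// 224.0.0.1, multicast
 *		.saddr = local_addr,		// locally assigned address
 *	};
 *	rt = __ip_route_output_key(net, &fl4);
 */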
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
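
/*
 * Editor's example (a sketch, not part of the original source): in-kernel
 * users normally go through the ip_route_output_key() wrapper; "dst_ip" and
 * "tos" below are hypothetical locals:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= dst_ip,
 *		.flowi4_tos	= RT_TOS(tos),
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);	// e.g. -ENETUNREACH from the slow path
 *	// ... use rt, then drop the reference taken by the lookup:
 *	ip_rt_put(rt);
 */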
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;

		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
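
/*
 * Editor's example (a sketch, not part of the original source): callers that
 * may hit IPsec policies set flowi4_proto (and pass their socket) so the
 * xfrm_lookup() above can rewrite the route:
 *
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */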
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol	= RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
		goto nla_put_failure;
	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    rt->rt_src != rt->rt_key_src) {
		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
			goto nla_put_failure;
	}
	if (rt->rt_dst != rt->rt_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark &&
	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
		goto nla_put_failure;

	error = rt->dst.error;
	expires = rt->dst.expires;
	if (expires) {
		if (time_before(jiffies, expires))
			expires -= jiffies;
		else
			expires = 0;
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
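
/*
 * Editor's example (a sketch, not part of the original source): the message
 * built above is the reply to an "ip route get" style query; a minimal
 * userspace request carries an rtmsg plus an RTA_DST attribute, mirroring
 * what inet_rtm_getroute() below parses:
 *
 *	struct {
 *		struct nlmsghdr	nlh;
 *		struct rtmsg	rtm;
 *		char		attrs[RTA_SPACE(sizeof(__be32))];
 *	} req = {
 *		.nlh.nlmsg_type		= RTM_GETROUTE,
 *		.nlh.nlmsg_flags	= NLM_F_REQUEST,
 *		.rtm.rtm_family		= AF_INET,
 *	};
 */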
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
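
/*
 * Editor's example (a sketch, not part of the original source): once the
 * tables above are registered, the cache is flushable and tunable from
 * userspace, e.g.:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 *	# echo 300 > /proc/sys/net/ipv4/route/gc_timeout
 */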
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &rhash_entries);
	if (ret)
		return 0;

	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
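
/*
 * Editor's example (a sketch, not part of the original source): the value
 * parsed above forces the route cache hash size from the kernel command
 * line, e.g. booting with "rhash_entries=262144"; otherwise ip_rt_init()
 * below lets alloc_large_system_hash() size the table from available memory.
 */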
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif