/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
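/*
 * Illustrative sketch, not part of the build: RT_FL_TOS keeps only the TOS
 * bits that matter for routing, plus the private RTO_ONLINK flag. Assuming
 * IPTOS_RT_MASK == 0x1c and RTO_ONLINK == 0x01 (their values in this tree's
 * headers), a flowi4_tos of 0x13 is reduced to 0x11:
 *
 *	struct flowi4 fl4 = { .flowi4_tos = 0x13 };
 *	unsigned int tos = RT_FL_TOS(&fl4);	// 0x13 & 0x1d == 0x11
 */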
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (rt->fi) {
			fib_info_put(rt->fi);
			rt->fi = NULL;
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    RCU protection.
 */
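/*
 * A minimal sketch of the reader side implied by the scheme above
 * (illustrative only; match() is a placeholder for the key comparison):
 * readers walk a bucket chain under rcu_read_lock_bh() and take a
 * reference with an atomic increment before leaving the critical section.
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (match(rth)) {
 *			dst_hold(&rth->dst);	// atomic refcount increment
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 */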
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks. The size of this table is a power of two and depends on the
 * number of CPUs. (on lockdep we have a quite big spinlock_t, so keep the
 * size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
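/*
 * Sketch of the striping arithmetic (illustrative): many hash buckets share
 * one of RT_HASH_LOCK_SZ spinlocks, picked by masking the bucket index, so
 * the lock table stays small while contention is spread out. With
 * RT_HASH_LOCK_SZ == 256, buckets 5, 261, 517, ... all map to lock 5:
 *
 *	spinlock_t *lock = rt_hash_lock_addr(261);  // == &rt_hash_locks[5]
 */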
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned int		rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
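/*
 * Usage sketch (illustrative): the bucket index mixes both addresses, the
 * interface index and the per-namespace generation id, so a genid bump
 * effectively invalidates every previously hashed entry.
 *
 *	unsigned int hash = rt_hash(iph->daddr, iph->saddr, dev->ifindex,
 *				    rt_genid(dev_net(dev)));
 *	struct rtable *rth = rcu_dereference(rt_hash_table[hash].chain);
 */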
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos, -1, 0, 0, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit, st->in_slow_tot, st->in_slow_mc,
		   st->in_no_route, st->in_brd, st->in_martian_dst,
		   st->in_martian_src, st->out_hit, st->out_slow_tot,
		   st->out_slow_mc, st->gc_total, st->gc_ignored,
		   st->gc_goal_miss, st->gc_dst_overflow,
		   st->in_hlist_search, st->out_hlist_search);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
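/*
 * Design note with a small worked example: both helpers above OR together
 * the XOR of each field pair and compare the result with zero, so the whole
 * multi-field comparison compiles to straight-line code with a single
 * branch instead of one branch per field. For two fields a and b:
 *
 *	((a1 ^ a2) | (b1 ^ b2)) == 0   <=>   a1 == a2 && b1 == b2
 */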
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
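/*
 * Worked example of the fixed-point format (illustrative), assuming
 * FRACT_BITS == 3: a chain holding two unique entries is counted as
 * length = 2 * ONE = 16. Four such chains give sum = 64 and sum2 = 1024,
 * hence, in rt_check_expire() below:
 *
 *	avg = 64 / 4 = 16
 *	sd  = int_sqrt(1024 / 4 - 16 * 16) = 0
 *	rt_chain_length_max = max(elasticity, (16 + 4 * 0) >> 3) = 2 entries
 */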
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth) ||
			    rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}

			/* We only count entries on a chain with equal
			 * hash inputs once so that entries for
			 * different QOS levels, and other non-hash
			 * input attributes don't unfairly skew
			 * the length computation
			 */
			tmo >>= 1;
			rthp = &rth->dst.rt_next;
			length += has_noalias(rt_hash_table[i].chain, rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without repeating a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_family(AF_INET);
}
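/*
 * Worked example (illustrative): adding shuffle + 1, with shuffle drawn
 * from [0, 255], steps rt_genid by a value in [1, 256]. Since rt_hash()
 * folds the genid into every bucket index, old entries simply stop being
 * found; e.g. a genid of 7 bumped by 42 makes rt_is_expired() true for
 * every entry that still carries rt_genid == 7.
 */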
/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
static void rt_emergency_hash_rebuild(struct net *net)
{
	net_warn_ratelimited("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
 * Short description of GC goals.
 *
 * We want to build an algorithm which will keep the routing cache
 * at some equilibrium point, where the number of aged-off entries
 * is kept approximately equal to the number of newly generated ones.
 *
 * The current expiration strength is the variable "expire".
 * We try to adjust it dynamically, so that when the network
 * is idle, expire is large enough to keep enough warm entries,
 * and when load increases, it shrinks to limit the cache size.
 */
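/*
 * A small numeric sketch of the equilibrium logic below (illustrative):
 * with gc_thresh == 256 and 1024 cached entries, equilibrium starts at
 * 256, is raised by min(goal/2, hash size) to spread the work out, and
 * goal becomes entries - equilibrium; the scan then expires entries until
 * goal reaches zero, or gives up and halves "expire" for the next pass.
 */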
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	net_warn_ratelimited("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);

	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				net_warn_ratelimited("Neighbour table failure & not caching routes\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain
		 * length; when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is an output
	   route or on the unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			net_warn_ratelimited("Neighbour table overflow\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUs.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer_base *base;
	struct inet_peer *peer;

	base = inetpeer_base_ptr(rt->_peer);
	if (!base)
		return;

	peer = inet_getpeer_v4(base, daddr, create);
	if (peer) {
		if (!rt_set_peer(rt, peer))
			inet_putpeer(peer);
		else
			rt->rt_peer_genid = rt_peer_genid();
	}
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (peer) {
			iph->id = htons(inet_getid(peer, more));
			return;
		}
	} else if (!rt)
		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned int hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				peer = rt_get_peer_create(rt, rt->rt_dst);
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw) {
						peer->redirect_learned.a4 = new_gw;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt_has_peer(rt)) {
			struct inet_peer *peer = rt_peer_ptr(rt);
			if (peer_pmtu_expired(peer))
				dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * Algorithm:
 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    has forgotten the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect packet.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, rt->rt_iif,
					     &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = rt_get_peer_create(rt, rt->rt_dst);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	peer = rt_get_peer_create(rt, rt->rt_dst);
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
			   protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
			   iph->daddr, iph->saddr, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		ip_rt_update_pmtu(&rt->dst, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
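/*
 * Usage sketch (illustrative; the caller and new_mtu value are hypothetical):
 * a caller that has just parsed an ICMP FRAG_NEEDED message for a TCP flow
 * could propagate the new MTU with something like
 *
 *	ipv4_update_pmtu(skb, dev_net(skb->dev), new_mtu,
 *			 0, 0, IPPROTO_TCP, 0);
 *
 * which rebuilds the flow key from the inner IP header at skb->data and
 * updates the matching cached route's MTU metric.
 */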
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct inet_sock *inet = inet_sk(sk);

	return ipv4_update_pmtu(skb, sock_net(sk), mtu,
				sk->sk_bound_dev_if, sk->sk_mark,
				inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
				inet_sk_flowi_flags(sk));
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);

		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (rt_has_peer(rt)) {
		struct inet_peer *peer = rt_peer_ptr(rt);
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt_has_peer(rt)) {
		struct inet_peer *peer = rt_peer_ptr(rt);
		if (peer_pmtu_cleaned(peer))
			dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
	}
}
static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer_base *base;
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	base = inetpeer_base_ptr(rt->_peer);
	BUG_ON(!base);

	peer = inet_getpeer_v4(base, rt->rt_dst, create);
	if (peer) {
		__rt_set_peer(rt, peer);
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);

		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, dev_net(dev)->ipv4.peers);
	rth->fi = NULL;
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * Per the RFC 1812 recommendation, if the source is martian,
		 * the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy ARP feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif = in_dev->dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, &res->table->tb_peers);
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned int hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 * NOTE. We drop all the packets that have local source
 * addresses, because every properly looped back packet
 * must have correct destination already attached by output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with 100% guarantee.
 *
 * called with rcu_read_lock()
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned int	hash;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	 * by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
		if (ipv4_is_loopback(daddr))
			goto martian_destination;

		if (ipv4_is_loopback(saddr))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags = flags|RTCF_LOCAL;
	rth->rt_type = res.type;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_peer_genid = 0;
	rt_init_peer(rth, net->ipv4.peers);
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable	*rth;
	unsigned int	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result, a host on a multicast
	 * network may acquire a lot of useless route cache entries, sort of
	 * SDR messages from all the world. Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
2460 /* called with rcu_read_lock() */
2461 static struct rtable *__mkroute_output(const struct fib_result *res,
2462 const struct flowi4 *fl4,
2463 __be32 orig_daddr, __be32 orig_saddr,
2464 int orig_oif, __u8 orig_rtos,
2465 struct net_device *dev_out,
2468 struct fib_info *fi = res->fi;
2469 struct in_device *in_dev;
2470 u16 type = res->type;
2473 in_dev = __in_dev_get_rcu(dev_out);
2475 return ERR_PTR(-EINVAL);
2477 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2478 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2479 return ERR_PTR(-EINVAL);
2481 if (ipv4_is_lbcast(fl4->daddr))
2482 type = RTN_BROADCAST;
2483 else if (ipv4_is_multicast(fl4->daddr))
2484 type = RTN_MULTICAST;
2485 else if (ipv4_is_zeronet(fl4->daddr))
2486 return ERR_PTR(-EINVAL);
2488 if (dev_out->flags & IFF_LOOPBACK)
2489 flags |= RTCF_LOCAL;
2491 if (type == RTN_BROADCAST) {
2492 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2494 } else if (type == RTN_MULTICAST) {
2495 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2496 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2498 flags &= ~RTCF_LOCAL;
2499 /* If a multicast route does not exist, use the
2500 * default one, but do not gateway in this case.
2503 if (fi && res->prefixlen < 4)
2507 rth = rt_dst_alloc(dev_out,
2508 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2509 IN_DEV_CONF_GET(in_dev, NOXFRM));
2511 return ERR_PTR(-ENOBUFS);
2513 rth->dst.output = ip_output;
2515 rth->rt_key_dst = orig_daddr;
2516 rth->rt_key_src = orig_saddr;
2517 rth->rt_genid = rt_genid(dev_net(dev_out));
2518 rth->rt_flags = flags;
2519 rth->rt_type = type;
2520 rth->rt_key_tos = orig_rtos;
2521 rth->rt_dst = fl4->daddr;
2522 rth->rt_src = fl4->saddr;
2523 rth->rt_route_iif = 0;
2524 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2525 rth->rt_oif = orig_oif;
2526 rth->rt_mark = fl4->flowi4_mark;
2527 rth->rt_gateway = fl4->daddr;
2528 rth->rt_peer_genid = 0;
2529 rt_init_peer(rth, (res->table ?
2530 &res->table->tb_peers :
2531 dev_net(dev_out)->ipv4.peers));
2534 RT_CACHE_STAT_INC(out_slow_tot);
2536 if (flags & RTCF_LOCAL)
2537 rth->dst.input = ip_local_deliver;
2538 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2539 if (flags & RTCF_LOCAL &&
2540 !(dev_out->flags & IFF_LOOPBACK)) {
2541 rth->dst.output = ip_mc_output;
2542 RT_CACHE_STAT_INC(out_slow_mc);
2544 #ifdef CONFIG_IP_MROUTE
2545 if (type == RTN_MULTICAST) {
2546 if (IN_DEV_MFORWARD(in_dev) &&
2547 !ipv4_is_local_multicast(fl4->daddr)) {
2548 rth->dst.input = ip_mr_input;
2549 rth->dst.output = ip_mc_output;
2555 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2557 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2558 rth->dst.flags |= DST_NOCACHE;
2564 * Major route resolver routine.
2565 * Called with rcu_read_lock().
2568 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2570 struct net_device *dev_out = NULL;
2571 __u8 tos = RT_FL_TOS(fl4);
2572 unsigned int flags = 0;
2573 struct fib_result res;
2581 #ifdef CONFIG_IP_MULTIPLE_TABLES
2585 orig_daddr = fl4->daddr;
2586 orig_saddr = fl4->saddr;
2587 orig_oif = fl4->flowi4_oif;
2589 fl4->flowi4_iif = net->loopback_dev->ifindex;
2590 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2591 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2592 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2596 rth = ERR_PTR(-EINVAL);
2597 if (ipv4_is_multicast(fl4->saddr) ||
2598 ipv4_is_lbcast(fl4->saddr) ||
2599 ipv4_is_zeronet(fl4->saddr))
2602 /* I removed the check for oif == dev_out->oif here.
2603 It was wrong for two reasons:
2604 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2605 is assigned to multiple interfaces.
2606 2. Moreover, we are allowed to send packets with a saddr
2607 of another iface. --ANK
2610 if (fl4->flowi4_oif == 0 &&
2611 (ipv4_is_multicast(fl4->daddr) ||
2612 ipv4_is_lbcast(fl4->daddr))) {
2613 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2614 dev_out = __ip_dev_find(net, fl4->saddr, false);
2615 if (dev_out == NULL)
2618 /* Special hack: the user can direct multicasts
2619 and limited broadcast via the necessary interface
2620 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2621 This hack is not just for fun, it allows
2622 vic, vat and friends to work.
2623 They bind a socket to loopback, set the ttl to zero
2624 and expect that it will work.
2625 From the viewpoint of the routing cache they are broken,
2626 because we are not allowed to build a multicast path
2627 with a loopback source addr (look, the routing cache
2628 cannot know that the ttl is zero, so that the packet
2629 will not leave this host and the route is valid).
2630 Luckily, this hack is a good workaround.
2633 fl4->flowi4_oif = dev_out->ifindex;
2637 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2638 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2639 if (!__ip_dev_find(net, fl4->saddr, false))
2645 if (fl4->flowi4_oif) {
2646 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2647 rth = ERR_PTR(-ENODEV);
2648 if (dev_out == NULL)
2651 /* RACE: Check return value of inet_select_addr instead. */
2652 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2653 rth = ERR_PTR(-ENETUNREACH);
2656 if (ipv4_is_local_multicast(fl4->daddr) ||
2657 ipv4_is_lbcast(fl4->daddr)) {
2659 fl4->saddr = inet_select_addr(dev_out, 0,
2664 if (ipv4_is_multicast(fl4->daddr))
2665 fl4->saddr = inet_select_addr(dev_out, 0,
2667 else if (!fl4->daddr)
2668 fl4->saddr = inet_select_addr(dev_out, 0,
2674 fl4->daddr = fl4->saddr;
2676 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2677 dev_out = net->loopback_dev;
2678 fl4->flowi4_oif = net->loopback_dev->ifindex;
2679 res.type = RTN_LOCAL;
2680 flags |= RTCF_LOCAL;
2684 if (fib_lookup(net, fl4, &res)) {
2687 if (fl4->flowi4_oif) {
2688 /* Apparently, the routing tables are wrong. Assume
2689 that the destination is on-link.
2692 Because we are allowed to send to an iface
2693 even if it has NO routes and NO assigned
2694 addresses. When oif is specified, the routing
2695 tables are looked up with only one purpose:
2696 to catch if the destination is gatewayed, rather than
2697 direct. Moreover, if MSG_DONTROUTE is set,
2698 we send the packet, ignoring both the routing tables
2699 and the ifaddr state. --ANK
2702 We could do this even if oif is unknown,
2703 as IPv6 likely does, but we do not.
2706 if (fl4->saddr == 0)
2707 fl4->saddr = inet_select_addr(dev_out, 0,
2709 res.type = RTN_UNICAST;
2712 rth = ERR_PTR(-ENETUNREACH);
2716 if (res.type == RTN_LOCAL) {
2718 if (res.fi->fib_prefsrc)
2719 fl4->saddr = res.fi->fib_prefsrc;
2721 fl4->saddr = fl4->daddr;
2723 dev_out = net->loopback_dev;
2724 fl4->flowi4_oif = dev_out->ifindex;
2726 flags |= RTCF_LOCAL;
2730 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2731 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2732 fib_select_multipath(&res);
2735 if (!res.prefixlen &&
2736 res.table->tb_num_default > 1 &&
2737 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2738 fib_select_default(&res);
2741 fl4->saddr = FIB_RES_PREFSRC(net, res);
2743 dev_out = FIB_RES_DEV(res);
2744 fl4->flowi4_oif = dev_out->ifindex;
2748 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2749 tos, dev_out, flags);
2753 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2754 rt_genid(dev_net(dev_out)));
2755 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2763 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2768 if (!rt_caching(net))
2771 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2774 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2775 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2776 if (rth->rt_key_dst == flp4->daddr &&
2777 rth->rt_key_src == flp4->saddr &&
2778 rt_is_output_route(rth) &&
2779 rth->rt_oif == flp4->flowi4_oif &&
2780 rth->rt_mark == flp4->flowi4_mark &&
2781 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2782 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2783 net_eq(dev_net(rth->dst.dev), net) &&
2784 !rt_is_expired(rth)) {
2785 ipv4_validate_peer(rth);
2786 dst_use(&rth->dst, jiffies);
2787 RT_CACHE_STAT_INC(out_hit);
2788 rcu_read_unlock_bh();
2790 flp4->saddr = rth->rt_src;
2792 flp4->daddr = rth->rt_dst;
2795 RT_CACHE_STAT_INC(out_hlist_search);
2797 rcu_read_unlock_bh();
2800 return ip_route_output_slow(net, flp4);
2802 EXPORT_SYMBOL_GPL(__ip_route_output_key);
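/*
 * Observability note: the out_hit/out_hlist_search counters above (and
 * their input-path siblings) are per-cpu statistics exported through
 * /proc/net/stat/rt_cache, so cache effectiveness can be checked from
 * userspace with e.g. "cat /proc/net/stat/rt_cache".
 */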
2804 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2809 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2811 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
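/* GNU "?:": fall back to the device MTU when no RTAX_MTU metric
 * is set (dst_metric_raw() then returns 0). */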
2813 return mtu ? : dst->dev->mtu;
2816 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2820 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2826 static struct dst_ops ipv4_dst_blackhole_ops = {
2828 .protocol = cpu_to_be16(ETH_P_IP),
2829 .destroy = ipv4_dst_destroy,
2830 .check = ipv4_blackhole_dst_check,
2831 .mtu = ipv4_blackhole_mtu,
2832 .default_advmss = ipv4_default_advmss,
2833 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2834 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2835 .neigh_lookup = ipv4_neigh_lookup,
2838 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2840 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2841 struct rtable *ort = (struct rtable *) dst_orig;
2844 struct dst_entry *new = &rt->dst;
2847 new->input = dst_discard;
2848 new->output = dst_discard;
2849 dst_copy_metrics(new, &ort->dst);
2851 new->dev = ort->dst.dev;
2855 rt->rt_key_dst = ort->rt_key_dst;
2856 rt->rt_key_src = ort->rt_key_src;
2857 rt->rt_key_tos = ort->rt_key_tos;
2858 rt->rt_route_iif = ort->rt_route_iif;
2859 rt->rt_iif = ort->rt_iif;
2860 rt->rt_oif = ort->rt_oif;
2861 rt->rt_mark = ort->rt_mark;
2863 rt->rt_genid = rt_genid(net);
2864 rt->rt_flags = ort->rt_flags;
2865 rt->rt_type = ort->rt_type;
2866 rt->rt_dst = ort->rt_dst;
2867 rt->rt_src = ort->rt_src;
2868 rt->rt_gateway = ort->rt_gateway;
2869 rt_transfer_peer(rt, ort);
2872 atomic_inc(&rt->fi->fib_clntref);
2877 dst_release(dst_orig);
2879 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
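/*
 * Illustrative sketch, assuming the xfrm-layer usage via the address
 * family's blackhole_route hook: an already-resolved route is swapped
 * for a blackhole copy whose handlers silently discard traffic.
 */
#if 0	/* example only */
	dst = ipv4_blackhole_route(net, dst);	/* releases the old dst ref */
	if (IS_ERR(dst))
		return PTR_ERR(dst);
	/* dst->output == dst_discard: packets routed here go nowhere. */
#endif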
2882 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2885 struct rtable *rt = __ip_route_output_key(net, flp4);
2890 if (flp4->flowi4_proto)
2891 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2892 flowi4_to_flowi(flp4),
2897 EXPORT_SYMBOL_GPL(ip_route_output_flow);
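/*
 * Illustrative sketch, not part of the original file: a typical output
 * lookup fills a flowi4 key, calls ip_route_output_flow(), and drops the
 * reference with ip_rt_put() when finished. The helper name and values
 * are example assumptions.
 */
#if 0	/* example only */
static int example_output_lookup(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.flowi4_proto	= IPPROTO_UDP,	/* non-zero => xfrm_lookup runs */
	};
	struct rtable *rt = ip_route_output_flow(net, &fl4, NULL);

	if (IS_ERR(rt))
		return PTR_ERR(rt);
	/* fl4.saddr now holds the source address chosen by the resolver */
	ip_rt_put(rt);
	return 0;
}
#endif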
2899 static int rt_fill_info(struct net *net,
2900 struct sk_buff *skb, u32 pid, u32 seq, int event,
2901 int nowait, unsigned int flags)
2903 struct rtable *rt = skb_rtable(skb);
2905 struct nlmsghdr *nlh;
2906 unsigned long expires = 0;
2907 u32 id = 0, ts = 0, tsage = 0, error;
2909 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2913 r = nlmsg_data(nlh);
2914 r->rtm_family = AF_INET;
2915 r->rtm_dst_len = 32;
2917 r->rtm_tos = rt->rt_key_tos;
2918 r->rtm_table = RT_TABLE_MAIN;
2919 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2920 goto nla_put_failure;
2921 r->rtm_type = rt->rt_type;
2922 r->rtm_scope = RT_SCOPE_UNIVERSE;
2923 r->rtm_protocol = RTPROT_UNSPEC;
2924 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2925 if (rt->rt_flags & RTCF_NOTIFY)
2926 r->rtm_flags |= RTM_F_NOTIFY;
2928 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2929 goto nla_put_failure;
2930 if (rt->rt_key_src) {
2931 r->rtm_src_len = 32;
2932 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2933 goto nla_put_failure;
2936 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2937 goto nla_put_failure;
2938 #ifdef CONFIG_IP_ROUTE_CLASSID
2939 if (rt->dst.tclassid &&
2940 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2941 goto nla_put_failure;
2943 if (!rt_is_input_route(rt) &&
2944 rt->rt_src != rt->rt_key_src) {
2945 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2946 goto nla_put_failure;
2948 if (rt->rt_dst != rt->rt_gateway &&
2949 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2950 goto nla_put_failure;
2952 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2953 goto nla_put_failure;
2956 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2957 goto nla_put_failure;
2959 error = rt->dst.error;
2960 if (rt_has_peer(rt)) {
2961 const struct inet_peer *peer = rt_peer_ptr(rt);
2962 inet_peer_refcheck(peer);
2963 id = atomic_read(&peer->ip_id_count) & 0xffff;
2964 if (peer->tcp_ts_stamp) {
2966 tsage = get_seconds() - peer->tcp_ts_stamp;
2968 expires = ACCESS_ONCE(peer->pmtu_expires);
2970 if (time_before(jiffies, expires))
2977 if (rt_is_input_route(rt)) {
2978 #ifdef CONFIG_IP_MROUTE
2979 __be32 dst = rt->rt_dst;
2981 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2982 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2983 int err = ipmr_get_route(net, skb,
2984 rt->rt_src, rt->rt_dst,
2990 goto nla_put_failure;
2992 if (err == -EMSGSIZE)
2993 goto nla_put_failure;
2999 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3000 goto nla_put_failure;
3003 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3004 expires, error) < 0)
3005 goto nla_put_failure;
3007 return nlmsg_end(skb, nlh);
3010 nlmsg_cancel(skb, nlh);
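/*
 * Note: the RTM_NEWROUTE message assembled above is what userspace sees
 * for "ip route get"; rtnl_put_cacheinfo() contributes the RTA_CACHEINFO
 * block (id, timestamps, expiry, error) that iproute2 renders on its
 * "cache" line.
 */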
3014 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3016 struct net *net = sock_net(in_skb->sk);
3018 struct nlattr *tb[RTA_MAX+1];
3019 struct rtable *rt = NULL;
3025 struct sk_buff *skb;
3027 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3031 rtm = nlmsg_data(nlh);
3033 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3039 /* Reserve room for dummy headers; this skb can pass
3040 through a good chunk of the routing engine.
3042 skb_reset_mac_header(skb);
3043 skb_reset_network_header(skb);
3045 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3046 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3047 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3049 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3050 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3051 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3052 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3055 struct net_device *dev;
3057 dev = __dev_get_by_index(net, iif);
3063 skb->protocol = htons(ETH_P_IP);
3067 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3070 rt = skb_rtable(skb);
3071 if (err == 0 && rt->dst.error)
3072 err = -rt->dst.error;
3074 struct flowi4 fl4 = {
3077 .flowi4_tos = rtm->rtm_tos,
3078 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3079 .flowi4_mark = mark,
3081 rt = ip_route_output_key(net, &fl4);
3091 skb_dst_set(skb, &rt->dst);
3092 if (rtm->rtm_flags & RTM_F_NOTIFY)
3093 rt->rt_flags |= RTCF_NOTIFY;
3095 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3096 RTM_NEWROUTE, 0, 0);
3100 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
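/*
 * Illustrative sketch, not part of the original file: the request this
 * handler services can be issued over a raw rtnetlink socket roughly as
 * follows (userspace code, standard rtnetlink ABI; the snippet itself is
 * an example assumption).
 */
#if 0	/* userspace example only */
	struct {
		struct nlmsghdr nh;
		struct rtmsg	rtm;
		struct rtattr	rta;
		__be32		dst;
	} req = {
		.nh.nlmsg_len	= sizeof(req),
		.nh.nlmsg_type	= RTM_GETROUTE,
		.nh.nlmsg_flags	= NLM_F_REQUEST,
		.rtm.rtm_family	= AF_INET,
		.rta.rta_type	= RTA_DST,
		.rta.rta_len	= RTA_LENGTH(sizeof(__be32)),
		.dst		= htonl(0x08080808),	/* 8.8.8.8 */
	};
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	send(fd, &req, sizeof(req), 0);
	/* The RTM_NEWROUTE reply is built by rt_fill_info() above. */
#endif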
3109 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3116 net = sock_net(skb->sk);
3121 s_idx = idx = cb->args[1];
3122 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3123 if (!rt_hash_table[h].chain)
3126 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3127 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3128 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3130 if (rt_is_expired(rt))
3132 skb_dst_set_noref(skb, &rt->dst);
3133 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3134 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3135 1, NLM_F_MULTI) <= 0) {
3137 rcu_read_unlock_bh();
3142 rcu_read_unlock_bh();
3151 void ip_rt_multicast_event(struct in_device *in_dev)
3153 rt_cache_flush(dev_net(in_dev->dev), 0);
3156 #ifdef CONFIG_SYSCTL
3157 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3158 void __user *buffer,
3159 size_t *lenp, loff_t *ppos)
3166 memcpy(&ctl, __ctl, sizeof(ctl));
3167 ctl.data = &flush_delay;
3168 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3170 net = (struct net *)__ctl->extra1;
3171 rt_cache_flush(net, flush_delay);
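/*
 * Usage note: this handler runs when userspace writes to
 * /proc/sys/net/ipv4/route/flush, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * and the written integer is handed to rt_cache_flush() as the delay
 * argument.
 */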
3178 static ctl_table ipv4_route_table[] = {
3180 .procname = "gc_thresh",
3181 .data = &ipv4_dst_ops.gc_thresh,
3182 .maxlen = sizeof(int),
3184 .proc_handler = proc_dointvec,
3187 .procname = "max_size",
3188 .data = &ip_rt_max_size,
3189 .maxlen = sizeof(int),
3191 .proc_handler = proc_dointvec,
3194 /* Deprecated. Use gc_min_interval_ms */
3196 .procname = "gc_min_interval",
3197 .data = &ip_rt_gc_min_interval,
3198 .maxlen = sizeof(int),
3200 .proc_handler = proc_dointvec_jiffies,
3203 .procname = "gc_min_interval_ms",
3204 .data = &ip_rt_gc_min_interval,
3205 .maxlen = sizeof(int),
3207 .proc_handler = proc_dointvec_ms_jiffies,
3210 .procname = "gc_timeout",
3211 .data = &ip_rt_gc_timeout,
3212 .maxlen = sizeof(int),
3214 .proc_handler = proc_dointvec_jiffies,
3217 .procname = "gc_interval",
3218 .data = &ip_rt_gc_interval,
3219 .maxlen = sizeof(int),
3221 .proc_handler = proc_dointvec_jiffies,
3224 .procname = "redirect_load",
3225 .data = &ip_rt_redirect_load,
3226 .maxlen = sizeof(int),
3228 .proc_handler = proc_dointvec,
3231 .procname = "redirect_number",
3232 .data = &ip_rt_redirect_number,
3233 .maxlen = sizeof(int),
3235 .proc_handler = proc_dointvec,
3238 .procname = "redirect_silence",
3239 .data = &ip_rt_redirect_silence,
3240 .maxlen = sizeof(int),
3242 .proc_handler = proc_dointvec,
3245 .procname = "error_cost",
3246 .data = &ip_rt_error_cost,
3247 .maxlen = sizeof(int),
3249 .proc_handler = proc_dointvec,
3252 .procname = "error_burst",
3253 .data = &ip_rt_error_burst,
3254 .maxlen = sizeof(int),
3256 .proc_handler = proc_dointvec,
3259 .procname = "gc_elasticity",
3260 .data = &ip_rt_gc_elasticity,
3261 .maxlen = sizeof(int),
3263 .proc_handler = proc_dointvec,
3266 .procname = "mtu_expires",
3267 .data = &ip_rt_mtu_expires,
3268 .maxlen = sizeof(int),
3270 .proc_handler = proc_dointvec_jiffies,
3273 .procname = "min_pmtu",
3274 .data = &ip_rt_min_pmtu,
3275 .maxlen = sizeof(int),
3277 .proc_handler = proc_dointvec,
3280 .procname = "min_adv_mss",
3281 .data = &ip_rt_min_advmss,
3282 .maxlen = sizeof(int),
3284 .proc_handler = proc_dointvec,
3289 static struct ctl_table ipv4_route_flush_table[] = {
3291 .procname = "flush",
3292 .maxlen = sizeof(int),
3294 .proc_handler = ipv4_sysctl_rtcache_flush,
3299 static __net_init int sysctl_route_net_init(struct net *net)
3301 struct ctl_table *tbl;
3303 tbl = ipv4_route_flush_table;
3304 if (!net_eq(net, &init_net)) {
3305 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3309 tbl[0].extra1 = net;
3311 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3312 if (net->ipv4.route_hdr == NULL)
3317 if (tbl != ipv4_route_flush_table)
3323 static __net_exit void sysctl_route_net_exit(struct net *net)
3325 struct ctl_table *tbl;
3327 tbl = net->ipv4.route_hdr->ctl_table_arg;
3328 unregister_net_sysctl_table(net->ipv4.route_hdr);
3329 BUG_ON(tbl == ipv4_route_flush_table);
3333 static __net_initdata struct pernet_operations sysctl_route_ops = {
3334 .init = sysctl_route_net_init,
3335 .exit = sysctl_route_net_exit,
3339 static __net_init int rt_genid_init(struct net *net)
3341 get_random_bytes(&net->ipv4.rt_genid,
3342 sizeof(net->ipv4.rt_genid));
3343 get_random_bytes(&net->ipv4.dev_addr_genid,
3344 sizeof(net->ipv4.dev_addr_genid));
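/*
 * Note: rt_genid is the per-netns generation count stamped into every
 * cached route; rt_is_expired() compares the stamp against the current
 * value, so bumping the generation lazily invalidates all existing
 * cache entries without walking the hash table.
 */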
3348 static __net_initdata struct pernet_operations rt_genid_ops = {
3349 .init = rt_genid_init,
3352 static int __net_init ipv4_inetpeer_init(struct net *net)
3354 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3358 inet_peer_base_init(bp);
3359 net->ipv4.peers = bp;
3363 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3365 struct inet_peer_base *bp = net->ipv4.peers;
3367 net->ipv4.peers = NULL;
3368 inetpeer_invalidate_tree(bp);
3372 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3373 .init = ipv4_inetpeer_init,
3374 .exit = ipv4_inetpeer_exit,
3377 #ifdef CONFIG_IP_ROUTE_CLASSID
3378 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3379 #endif /* CONFIG_IP_ROUTE_CLASSID */
3381 static __initdata unsigned long rhash_entries;
3382 static int __init set_rhash_entries(char *str)
3389 ret = kstrtoul(str, 0, &rhash_entries);
3395 __setup("rhash_entries=", set_rhash_entries);
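/*
 * Usage note: the route-cache hash size can be pinned on the kernel
 * command line, e.g.
 *
 *	rhash_entries=131072
 *
 * otherwise alloc_large_system_hash() below sizes the table from the
 * amount of available memory.
 */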
3397 int __init ip_rt_init(void)
3401 #ifdef CONFIG_IP_ROUTE_CLASSID
3402 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3404 panic("IP: failed to allocate ip_rt_acct\n");
3407 ipv4_dst_ops.kmem_cachep =
3408 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3409 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3411 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3413 if (dst_entries_init(&ipv4_dst_ops) < 0)
3414 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3416 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3417 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3419 rt_hash_table = (struct rt_hash_bucket *)
3420 alloc_large_system_hash("IP route cache",
3421 sizeof(struct rt_hash_bucket),
3423 (totalram_pages >= 128 * 1024) ?
3429 rhash_entries ? 0 : 512 * 1024);
3430 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3431 rt_hash_lock_init();
3433 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3434 ip_rt_max_size = (rt_hash_mask + 1) * 16;
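/*
 * Worked example with illustrative numbers: a table of 2^19 = 524288
 * buckets gives rt_hash_mask = 524287, an initial gc_thresh of 524288
 * entries and an ip_rt_max_size of 524288 * 16 = 8388608 entries.
 */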
3439 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3440 expires_ljiffies = jiffies;
3441 schedule_delayed_work(&expires_work,
3442 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3444 if (ip_rt_proc_init())
3445 pr_err("Unable to create route proc files\n");
3448 xfrm4_init(ip_rt_max_size);
3450 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3452 #ifdef CONFIG_SYSCTL
3453 register_pernet_subsys(&sysctl_route_ops);
3455 register_pernet_subsys(&rt_genid_ops);
3456 register_pernet_subsys(&ipv4_inetpeer_ops);
3460 #ifdef CONFIG_SYSCTL
3462 * We really need to sanitize the damn ipv4 init order, then all
3463 * this nonsense will go away.
3465 void __init ip_static_sysctl_init(void)
3467 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);