2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #define pr_fmt(fmt) "IPv4: " fmt
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
99 #include <net/ip_fib.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
110 #include <net/secure_seq.h>
/*
 * Reduce a flow key's flowi4_tos to the bits selected by IPTOS_RT_MASK plus
 * the RTO_ONLINK flag.  The argument is dereferenced with "->", so it must be
 * a pointer to the flow key.
 */
112 #define RT_FL_TOS(oldflp4) \
113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
/* Ceiling that ipv4_mtu() compares the MTU it computed against. */
115 #define IP_MAX_MTU 0xFFF0
/* Default for ip_rt_gc_timeout; like the values below it is in jiffies. */
117 #define RT_GC_TIMEOUT (300*HZ)
/*
 * File-local routing tunables.  Values written as multiples or fractions of
 * HZ are time spans in jiffies.
 */
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
/*
 * Redirect rate limiting (see ip_rt_send_redirect()): once a peer's
 * rate_tokens reaches ip_rt_redirect_number no more redirects are sent.
 */
123 static int ip_rt_redirect_number __read_mostly = 9;
/* Base delay; it is shifted left by the peer's rate_tokens for backoff. */
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
/*
 * Quiet period after which a peer's rate_tokens is reset to 0.  The literal
 * value equals ip_rt_redirect_load << (ip_rt_redirect_number + 1) for the
 * defaults above.
 */
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
/*
 * ICMP error limiting in ip_error(): a peer's tokens grow with elapsed
 * jiffies, are capped at ip_rt_error_burst, and ip_rt_error_cost is
 * subtracted when enough tokens are available.
 */
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly = 8;
/* Added to jiffies to form the expiry time of a learned path MTU. */
129 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
/*
 * Floor applied to a reported MTU in __ip_rt_update_pmtu().
 * NOTE(review): 512 + 20 + 20 presumably means payload + IP header + TCP
 * header - confirm.
 */
130 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
/* NOTE(review): presumably the lower bound used by ipv4_default_advmss(). */
131 static int ip_rt_min_advmss __read_mostly = 256;
134 * Interface to generic destination cache.
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu);
144 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
/*
 * Operations table that plugs the IPv4 route handlers of this file into the
 * generic destination cache.  Every callback except .local_out is one of the
 * static functions declared above; .local_out points at __ip_local_out.
 */
163 static struct dst_ops ipv4_dst_ops = {
165 .protocol = cpu_to_be16(ETH_P_IP),
166 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss,
169 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy,
171 .ifdown = ipv4_dst_ifdown,
172 .negative_advice = ipv4_negative_advice,
173 .link_failure = ipv4_link_failure,
174 .update_pmtu = ip_rt_update_pmtu,
175 .redirect = ip_do_redirect,
176 .local_out = __ip_local_out,
177 .neigh_lookup = ipv4_neigh_lookup,
180 #define ECN_OR_COST(class) TC_PRIO_##class
182 const __u8 ip_tos2prio[16] = {
184 ECN_OR_COST(BESTEFFORT),
186 ECN_OR_COST(BESTEFFORT),
192 ECN_OR_COST(INTERACTIVE),
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
200 EXPORT_SYMBOL(ip_tos2prio);
/* One struct rt_cache_stat per CPU; reported through rt_cpu_seq_show(). */
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment @field of rt_cache_stat using __this_cpu_inc(). */
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
210 return SEQ_START_TOKEN;
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
233 static const struct seq_operations rt_cache_seq_ops = {
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
242 return seq_open(file, &rt_cache_seq_ops);
245 static const struct file_operations rt_cache_seq_fops = {
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
250 .release = seq_release,
/*
 * seq_file ->start for the per-CPU statistics file.
 *
 * Yields either SEQ_START_TOKEN (which rt_cpu_seq_show() turns into the
 * header line) or a pointer to the rt_cache_stat of a possible CPU, scanning
 * upwards from CPU number *pos - 1.
 * NOTE(review): the "- 1" presumably accounts for position 0 being the
 * header - confirm.
 */
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
259 return SEQ_START_TOKEN;
261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 if (!cpu_possible(cpu))
265 return &per_cpu(rt_cache_stat, cpu);
/*
 * seq_file ->next for the per-CPU statistics file: scan CPU numbers upwards
 * from *pos and return the rt_cache_stat of a possible CPU.
 */
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 if (!cpu_possible(cpu))
278 return &per_cpu(rt_cache_stat, cpu);
284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
/*
 * seq_file ->show for the per-CPU statistics file.
 *
 * For SEQ_START_TOKEN the column header is printed.  Otherwise @v is a
 * struct rt_cache_stat and one row of 17 hexadecimal fields is printed,
 * matching the 17 header columns; the first field is
 * dst_entries_get_slow(&ipv4_dst_ops).
 */
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
291 struct rt_cache_stat *st = v;
293 if (v == SEQ_START_TOKEN) {
294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 dst_entries_get_slow(&ipv4_dst_ops),
323 static const struct seq_operations rt_cpu_seq_ops = {
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
333 return seq_open(file, &rt_cpu_seq_ops);
336 static const struct file_operations rt_cpu_seq_fops = {
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
341 .release = seq_release,
344 #ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Show routine for the "rt_acct" proc file.
 *
 * Builds a zero-initialised array of 256 struct ip_rt_acct, adds into it the
 * byte and packet counters of every possible CPU's ip_rt_acct array, and
 * writes the summed array to @m as raw binary data (seq_write(), not text).
 */
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
347 struct ip_rt_acct *dst, *src;
/* kcalloc() zeroes the buffer, so the "+=" sums below start from 0. */
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
371 return single_open(file, rt_acct_proc_show, NULL);
374 static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
379 .release = single_release,
/*
 * Per-namespace setup of the routing proc files for @net:
 *   - "rt_cache" under net->proc_net,
 *   - "rt_cache" under net->proc_net_stat (per-CPU statistics),
 *   - "rt_acct" under net->proc_net when CONFIG_IP_ROUTE_CLASSID is set.
 * The remove_proc_entry() calls at the end undo the entries created earlier;
 * NOTE(review): presumably reached only on a creation failure - confirm.
 */
383 static int __net_init ip_rt_do_proc_init(struct net *net)
385 struct proc_dir_entry *pde;
387 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
404 #ifdef CONFIG_IP_ROUTE_CLASSID
406 remove_proc_entry("rt_cache", net->proc_net_stat);
409 remove_proc_entry("rt_cache", net->proc_net);
/*
 * Per-namespace teardown: remove the proc entries that ip_rt_do_proc_init()
 * creates for @net ("rt_acct" only when CONFIG_IP_ROUTE_CLASSID is set).
 */
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 remove_proc_entry("rt_acct", net->proc_net);
423 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
428 static int __init ip_rt_proc_init(void)
430 return register_pernet_subsys(&ip_rt_proc_ops);
434 static inline int ip_rt_proc_init(void)
438 #endif /* CONFIG_PROC_FS */
/*
 * Return true when @rth was created under a different generation id than the
 * current rt_genid() of the namespace its device belongs to.
 */
440 static inline bool rt_is_expired(const struct rtable *rth)
442 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
445 void rt_cache_flush(struct net *net)
/*
 * dst_ops ->neigh_lookup: find, or create, the neighbour entry used to reach
 * the next hop of @dst on dst->dev.
 *
 * The lookup key starts out as the caller's daddr and may be replaced by the
 * route's rt_gateway or by the destination address in the skb's IP header.
 * If __ipv4_neigh_lookup() does not supply an entry, a new one is created in
 * arp_tbl with neigh_create().
 */
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
456 const struct rtable *rt;
459 rt = (const struct rtable *) dst;
461 pkey = (const __be32 *) &rt->rt_gateway;
463 pkey = &ip_hdr(skb)->daddr;
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
468 return neigh_create(&arp_tbl, pkey, dev);
472 * Peer allocation may fail only in serious out-of-memory conditions. However,
473 * we can still generate some output.
474 * Random ID selection looks a bit dangerous because we have no chance of
475 * selecting an ID that is unique within a reasonable period of time.
476 * But a broken packet identifier may be better than no packet at all.
/*
 * Fallback IP ID selection, called from __ip_select_ident().
 *
 * A file-wide running value (ip_fallback_id) is mixed with the packet's
 * destination address and passed through secure_ip_id(); the low 16 bits of
 * the result become iph->id and the full result becomes the new running
 * value.  ip_fb_id_lock serialises the read-modify-write of that value.
 */
478 static void ip_select_fb_ident(struct iphdr *iph)
480 static DEFINE_SPINLOCK(ip_fb_id_lock);
481 static u32 ip_fallback_id;
484 spin_lock_bh(&ip_fb_id_lock);
485 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486 iph->id = htons(salt & 0xFFFF);
487 ip_fallback_id = salt;
488 spin_unlock_bh(&ip_fb_id_lock);
/*
 * Choose the IP identification field for @iph.
 *
 * The inet_peer for the destination address is requested from the namespace
 * of dst->dev (third argument 1 to inet_getpeer_v4()) and the id is taken
 * from inet_getid(peer, @more).  ip_select_fb_ident() supplies the id on the
 * fallback path.
 */
491 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
493 struct net *net = dev_net(dst->dev);
494 struct inet_peer *peer;
496 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
498 iph->id = htons(inet_getid(peer, more));
503 ip_select_fb_ident(iph);
505 EXPORT_SYMBOL(__ip_select_ident);
/*
 * Fill @fl4 as an output flow key for the addresses in @iph.
 *
 * The caller supplies oif, tos, prot, mark and flow_flags.  The visible
 * assignments replace oif, tos and prot with values taken from @sk: bound
 * device, RT_CONN_FLAGS(), and IPPROTO_RAW for hdrincl sockets or else
 * sk_protocol.
 * NOTE(review): several callers in this file pass sk == NULL, so these
 * overrides are presumably guarded by a NULL check - confirm.
 * Scope is RT_SCOPE_UNIVERSE and the two trailing flowi4_init_output()
 * arguments are passed as 0.
 */
507 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508 const struct iphdr *iph,
510 u8 prot, u32 mark, int flow_flags)
513 const struct inet_sock *inet = inet_sk(sk);
515 oif = sk->sk_bound_dev_if;
517 tos = RT_CONN_FLAGS(sk);
518 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
520 flowi4_init_output(fl4, oif, mark, tos,
521 RT_SCOPE_UNIVERSE, prot,
523 iph->daddr, iph->saddr, 0, 0);
/*
 * Build a flow key from a packet: oif is the ifindex of skb->dev, tos is
 * RT_TOS() of the IP header's tos, and protocol and mark come from the IP
 * header and the skb.  These are handed to __build_flow_key() together with
 * @sk and flow_flags 0.
 */
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527 const struct sock *sk)
529 const struct iphdr *iph = ip_hdr(skb);
530 int oif = skb->dev->ifindex;
531 u8 tos = RT_TOS(iph->tos);
532 u8 prot = iph->protocol;
533 u32 mark = skb->mark;
535 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
/*
 * Build a flow key purely from socket state.
 *
 * The destination is the socket's inet_daddr unless the socket's IP options
 * carry a source route (opt.srr), in which case the options' faddr is used.
 * Bound device, mark, RT_CONN_FLAGS(), protocol (IPPROTO_RAW for hdrincl
 * sockets) and inet_sk_flowi_flags() all come from @sk.
 */
538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
540 const struct inet_sock *inet = inet_sk(sk);
541 const struct ip_options_rcu *inet_opt;
542 __be32 daddr = inet->inet_daddr;
545 inet_opt = rcu_dereference(inet->inet_opt);
546 if (inet_opt && inet_opt->opt.srr)
547 daddr = inet_opt->opt.faddr;
548 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551 inet_sk_flowi_flags(sk),
552 daddr, inet->inet_saddr, 0, 0);
/*
 * Build @fl4 either from the packet (build_skb_flow_key()) or from the socket
 * alone (build_sk_flow_key()).
 * NOTE(review): presumably the packet variant is chosen when @skb is
 * non-NULL - confirm.
 */
556 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
557 const struct sk_buff *skb)
560 build_skb_flow_key(fl4, skb, sk);
562 build_sk_flow_key(fl4, sk);
565 static inline void rt_free(struct rtable *rt)
567 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
570 static DEFINE_SPINLOCK(fnhe_lock);
/*
 * Pick an exception from the chain of @hash for reuse, comparing fnhe_stamp
 * values with time_before() to find the oldest one, and clear the chosen
 * entry's cached route pointer (fnhe_rth).
 *
 * The first chain entry is dereferenced without a NULL check, so the chain
 * must not be empty; the caller in this file only gets here once the chain
 * depth exceeds FNHE_RECLAIM_DEPTH.
 */
572 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
574 struct fib_nh_exception *fnhe, *oldest;
577 oldest = rcu_dereference(hash->chain);
578 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579 fnhe = rcu_dereference(fnhe->fnhe_next)) {
580 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
583 orig = rcu_dereference(oldest->fnhe_rth);
585 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
/*
 * Map a destination address to a bucket index of the next-hop exception
 * table: fold the 32-bit address with two right shifts and XOR, then mask
 * with FNHE_HASH_SIZE - 1.
 * NOTE(review): the mask assumes FNHE_HASH_SIZE is a power of two - confirm.
 */
591 static inline u32 fnhe_hashfun(__be32 daddr)
595 hval = (__force u32) daddr;
596 hval ^= (hval >> 11) ^ (hval >> 22);
598 return hval & (FNHE_HASH_SIZE - 1);
/*
 * Record or refresh the next-hop exception for @daddr on @nh.
 *
 * @gw:      new gateway; the redirect path passes the advertised gateway,
 *           the PMTU path passes 0.
 * @pmtu:    learned path MTU to store in the exception.
 * @expires: expiry time stored alongside @pmtu.
 *
 * Everything runs under fnhe_lock with bottom halves disabled, and both
 * allocations use GFP_ATOMIC.  The hash table of @nh is allocated on first
 * use.  An entry whose fnhe_daddr equals @daddr is updated in place;
 * otherwise a new entry is linked at the head of the chain, or, once the
 * chain is deeper than FNHE_RECLAIM_DEPTH, the entry returned by
 * fnhe_oldest() is reused.  fnhe_stamp is set to the current jiffies.
 */
601 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602 u32 pmtu, unsigned long expires)
604 struct fnhe_hash_bucket *hash;
605 struct fib_nh_exception *fnhe;
607 u32 hval = fnhe_hashfun(daddr);
609 spin_lock_bh(&fnhe_lock);
611 hash = nh->nh_exceptions;
613 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
616 nh->nh_exceptions = hash;
/* Search the chain for an existing exception for this destination. */
622 for (fnhe = rcu_dereference(hash->chain); fnhe;
623 fnhe = rcu_dereference(fnhe->fnhe_next)) {
624 if (fnhe->fnhe_daddr == daddr)
633 fnhe->fnhe_pmtu = pmtu;
634 fnhe->fnhe_expires = expires;
637 if (depth > FNHE_RECLAIM_DEPTH)
638 fnhe = fnhe_oldest(hash);
640 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
/* Publish the new entry at the head of the chain for RCU readers. */
644 fnhe->fnhe_next = hash->chain;
645 rcu_assign_pointer(hash->chain, fnhe);
647 fnhe->fnhe_daddr = daddr;
649 fnhe->fnhe_pmtu = pmtu;
650 fnhe->fnhe_expires = expires;
653 fnhe->fnhe_stamp = jiffies;
656 spin_unlock_bh(&fnhe_lock);
/*
 * Process a received ICMP redirect for route @rt.
 *
 * new_gw is the gateway advertised in the ICMP header and old_gw is the IP
 * source address of the ICMP packet.  The code checks the redirect code, the
 * route's current gateway against old_gw, and the sanity of new_gw; redirects
 * that fail the new_gw checks go to the reject_redirect label.  For an
 * accepted redirect a neighbour entry for new_gw is looked up and, after a
 * successful fib_lookup() on @fl4, the new gateway is recorded through
 * update_or_create_fnhe().
 */
660 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
663 __be32 new_gw = icmp_hdr(skb)->un.gateway;
664 __be32 old_gw = ip_hdr(skb)->saddr;
665 struct net_device *dev = skb->dev;
666 struct in_device *in_dev;
667 struct fib_result res;
671 switch (icmp_hdr(skb)->code & 7) {
673 case ICMP_REDIR_NETTOS:
674 case ICMP_REDIR_HOST:
675 case ICMP_REDIR_HOSTTOS:
/* The sender of the redirect is compared with the route's current gateway. */
682 if (rt->rt_gateway != old_gw)
685 in_dev = __in_dev_get_rcu(dev);
/*
 * Reject when the gateway is unchanged, redirects are not accepted on this
 * device, or new_gw is a multicast, limited-broadcast or zeronet address.
 */
690 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692 ipv4_is_zeronet(new_gw))
693 goto reject_redirect;
695 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697 goto reject_redirect;
698 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699 goto reject_redirect;
701 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702 goto reject_redirect;
705 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
/* A neighbour that is not NUD_VALID yet only gets resolution kicked off. */
707 if (!(n->nud_state & NUD_VALID)) {
708 neigh_event_send(n, NULL);
710 if (fib_lookup(net, fl4, &res) == 0) {
711 struct fib_nh *nh = &FIB_RES_NH(res);
713 update_or_create_fnhe(nh, fl4->daddr, new_gw,
/* DST_OBSOLETE_KILL makes ipv4_dst_check() treat this route as invalid. */
717 rt->dst.obsolete = DST_OBSOLETE_KILL;
718 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
725 #ifdef CONFIG_IP_ROUTE_VERBOSE
726 if (IN_DEV_LOG_MARTIANS(in_dev)) {
727 const struct iphdr *iph = (const struct iphdr *) skb->data;
728 __be32 daddr = iph->daddr;
729 __be32 saddr = iph->saddr;
731 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732 " Advised path = %pI4 -> %pI4\n",
733 &old_gw, dev->name, &new_gw,
/*
 * dst_ops ->redirect: build a flow key from @sk and @skb with
 * ip_rt_build_flow_key() and pass the route to __ip_do_redirect() with the
 * last argument set to true.
 */
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
745 rt = (struct rtable *) dst;
747 ip_rt_build_flow_key(&fl4, sk, skb);
748 __ip_do_redirect(rt, skb, &fl4, true);
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
753 struct rtable *rt = (struct rtable *)dst;
754 struct dst_entry *ret = dst;
757 if (dst->obsolete > 0) {
760 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
771 * 1. The first ip_rt_redirect_number redirects are sent
772 * with exponential backoff; after that we stop sending them altogether,
773 * assuming that the host ignores our redirects.
774 * 2. If we have not seen any packets requiring redirects
775 * during ip_rt_redirect_silence, we assume that the host has
776 * forgotten the redirected route and start sending redirects again.
778 * This algorithm is much cheaper and more intelligent than dumb load limiting
781 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782 * and "frag. need" (breaks PMTU discovery) in icmp.c.
785 void ip_rt_send_redirect(struct sk_buff *skb)
787 struct rtable *rt = skb_rtable(skb);
788 struct in_device *in_dev;
789 struct inet_peer *peer;
794 in_dev = __in_dev_get_rcu(rt->dst.dev);
795 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
799 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
802 net = dev_net(rt->dst.dev);
803 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
805 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
809 /* No redirected packets during ip_rt_redirect_silence;
810 * reset the algorithm.
812 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
813 peer->rate_tokens = 0;
815 /* Too many ignored redirects; do not send anything
816 * set dst.rate_last to the last seen redirected packet.
818 if (peer->rate_tokens >= ip_rt_redirect_number) {
819 peer->rate_last = jiffies;
823 /* Check for load limit; set rate_last to the latest sent
826 if (peer->rate_tokens == 0 ||
829 (ip_rt_redirect_load << peer->rate_tokens)))) {
830 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
831 peer->rate_last = jiffies;
833 #ifdef CONFIG_IP_ROUTE_VERBOSE
835 peer->rate_tokens == ip_rt_redirect_number)
836 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
837 &ip_hdr(skb)->saddr, inet_iif(skb),
838 &ip_hdr(skb)->daddr, &rt->rt_gateway);
845 static int ip_error(struct sk_buff *skb)
847 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
848 struct rtable *rt = skb_rtable(skb);
849 struct inet_peer *peer;
855 net = dev_net(rt->dst.dev);
856 if (!IN_DEV_FORWARD(in_dev)) {
857 switch (rt->dst.error) {
859 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
869 switch (rt->dst.error) {
874 code = ICMP_HOST_UNREACH;
877 code = ICMP_NET_UNREACH;
878 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
881 code = ICMP_PKT_FILTERED;
885 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
890 peer->rate_tokens += now - peer->rate_last;
891 if (peer->rate_tokens > ip_rt_error_burst)
892 peer->rate_tokens = ip_rt_error_burst;
893 peer->rate_last = now;
894 if (peer->rate_tokens >= ip_rt_error_cost)
895 peer->rate_tokens -= ip_rt_error_cost;
901 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
907 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
909 struct fib_result res;
911 if (mtu < ip_rt_min_pmtu)
912 mtu = ip_rt_min_pmtu;
915 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
916 struct fib_nh *nh = &FIB_RES_NH(res);
918 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
919 jiffies + ip_rt_mtu_expires);
925 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
926 struct sk_buff *skb, u32 mtu)
928 struct rtable *rt = (struct rtable *) dst;
931 ip_rt_build_flow_key(&fl4, sk, skb);
932 mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
935 dst->obsolete = DST_OBSOLETE_KILL;
938 rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
942 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
943 int oif, u32 mark, u8 protocol, int flow_flags)
945 const struct iphdr *iph = (const struct iphdr *) skb->data;
949 __build_flow_key(&fl4, NULL, iph, oif,
950 RT_TOS(iph->tos), protocol, mark, flow_flags);
951 rt = __ip_route_output_key(net, &fl4);
953 __ip_rt_update_pmtu(rt, &fl4, mtu);
957 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
959 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
961 const struct iphdr *iph = (const struct iphdr *) skb->data;
965 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
966 rt = __ip_route_output_key(sock_net(sk), &fl4);
968 __ip_rt_update_pmtu(rt, &fl4, mtu);
972 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
974 void ipv4_redirect(struct sk_buff *skb, struct net *net,
975 int oif, u32 mark, u8 protocol, int flow_flags)
977 const struct iphdr *iph = (const struct iphdr *) skb->data;
981 __build_flow_key(&fl4, NULL, iph, oif,
982 RT_TOS(iph->tos), protocol, mark, flow_flags);
983 rt = __ip_route_output_key(net, &fl4);
985 __ip_do_redirect(rt, skb, &fl4, false);
989 EXPORT_SYMBOL_GPL(ipv4_redirect);
991 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
993 const struct iphdr *iph = (const struct iphdr *) skb->data;
997 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
998 rt = __ip_route_output_key(sock_net(sk), &fl4);
1000 __ip_do_redirect(rt, skb, &fl4, false);
1004 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1006 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1008 struct rtable *rt = (struct rtable *) dst;
1010 /* All IPV4 dsts are created with ->obsolete set to the value
1011 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1012 * into this function always.
1014 * When a PMTU/redirect information update invalidates a
1015 * route, this is indicated by setting obsolete to
1016 * DST_OBSOLETE_KILL.
1018 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1023 static void ipv4_link_failure(struct sk_buff *skb)
1027 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1029 rt = skb_rtable(skb);
1031 dst_set_expires(&rt->dst, 0);
1034 static int ip_rt_bug(struct sk_buff *skb)
1036 pr_debug("%s: %pI4 -> %pI4, %s\n",
1037 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1038 skb->dev ? skb->dev->name : "?");
1045 We do not cache the source address of the outgoing interface,
1046 because it is used only by the IP RR, TS and SRR options,
1047 so it is off the fast path.
1049 BTW remember: "addr" is allowed to be unaligned.
1053 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1057 if (rt_is_output_route(rt))
1058 src = ip_hdr(skb)->saddr;
1060 struct fib_result res;
1066 memset(&fl4, 0, sizeof(fl4));
1067 fl4.daddr = iph->daddr;
1068 fl4.saddr = iph->saddr;
1069 fl4.flowi4_tos = RT_TOS(iph->tos);
1070 fl4.flowi4_oif = rt->dst.dev->ifindex;
1071 fl4.flowi4_iif = skb->dev->ifindex;
1072 fl4.flowi4_mark = skb->mark;
1075 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1076 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1078 src = inet_select_addr(rt->dst.dev,
1079 rt_nexthop(rt, iph->daddr),
1083 memcpy(addr, &src, 4);
1086 #ifdef CONFIG_IP_ROUTE_CLASSID
1087 static void set_class_tag(struct rtable *rt, u32 tag)
1089 if (!(rt->dst.tclassid & 0xFFFF))
1090 rt->dst.tclassid |= tag & 0xFFFF;
1091 if (!(rt->dst.tclassid & 0xFFFF0000))
1092 rt->dst.tclassid |= tag & 0xFFFF0000;
1096 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1098 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1101 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1103 if (advmss > 65535 - 40)
1104 advmss = 65535 - 40;
1109 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1111 const struct rtable *rt = (const struct rtable *) dst;
1112 unsigned int mtu = rt->rt_pmtu;
1114 if (mtu && time_after_eq(jiffies, rt->dst.expires))
1118 mtu = dst_metric_raw(dst, RTAX_MTU);
1120 if (mtu && rt_is_output_route(rt))
1123 mtu = dst->dev->mtu;
1125 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1126 if (rt->rt_gateway && mtu > 576)
1130 if (mtu > IP_MAX_MTU)
1136 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1138 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1139 struct fib_nh_exception *fnhe;
1145 hval = fnhe_hashfun(daddr);
1147 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1148 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1149 if (fnhe->fnhe_daddr == daddr)
1155 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1160 spin_lock_bh(&fnhe_lock);
1162 if (daddr == fnhe->fnhe_daddr) {
1163 struct rtable *orig;
1165 if (fnhe->fnhe_pmtu) {
1166 unsigned long expires = fnhe->fnhe_expires;
1167 unsigned long diff = expires - jiffies;
1169 if (time_before(jiffies, expires)) {
1170 rt->rt_pmtu = fnhe->fnhe_pmtu;
1171 dst_set_expires(&rt->dst, diff);
1174 if (fnhe->fnhe_gw) {
1175 rt->rt_flags |= RTCF_REDIRECTED;
1176 rt->rt_gateway = fnhe->fnhe_gw;
1179 orig = rcu_dereference(fnhe->fnhe_rth);
1180 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1184 fnhe->fnhe_stamp = jiffies;
1187 /* Routes we intend to cache in nexthop exception have
1188 * the DST_NOCACHE bit clear. However, if we are
1189 * unsuccessful at storing this route into the cache
1190 * we really need to set it.
1192 rt->dst.flags |= DST_NOCACHE;
1194 spin_unlock_bh(&fnhe_lock);
1199 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1201 struct rtable *orig, *prev, **p;
1204 if (rt_is_input_route(rt)) {
1205 p = (struct rtable **)&nh->nh_rth_input;
1207 if (!nh->nh_pcpu_rth_output)
1209 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1213 prev = cmpxchg(p, orig, rt);
1218 /* Routes we intend to cache in the FIB nexthop have
1219 * the DST_NOCACHE bit clear. However, if we are
1220 * unsuccessful at storing this route into the cache
1221 * we really need to set it.
1224 rt->dst.flags |= DST_NOCACHE;
1231 static DEFINE_SPINLOCK(rt_uncached_lock);
1232 static LIST_HEAD(rt_uncached_list);
1234 static void rt_add_uncached_list(struct rtable *rt)
1236 spin_lock_bh(&rt_uncached_lock);
1237 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1238 spin_unlock_bh(&rt_uncached_lock);
1241 static void ipv4_dst_destroy(struct dst_entry *dst)
1243 struct rtable *rt = (struct rtable *) dst;
1245 if (!list_empty(&rt->rt_uncached)) {
1246 spin_lock_bh(&rt_uncached_lock);
1247 list_del(&rt->rt_uncached);
1248 spin_unlock_bh(&rt_uncached_lock);
1252 void rt_flush_dev(struct net_device *dev)
1254 if (!list_empty(&rt_uncached_list)) {
1255 struct net *net = dev_net(dev);
1258 spin_lock_bh(&rt_uncached_lock);
1259 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1260 if (rt->dst.dev != dev)
1262 rt->dst.dev = net->loopback_dev;
1263 dev_hold(rt->dst.dev);
1266 spin_unlock_bh(&rt_uncached_lock);
1270 static bool rt_cache_valid(const struct rtable *rt)
1273 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1277 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1278 const struct fib_result *res,
1279 struct fib_nh_exception *fnhe,
1280 struct fib_info *fi, u16 type, u32 itag)
1282 bool cached = false;
1285 struct fib_nh *nh = &FIB_RES_NH(*res);
1287 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1288 rt->rt_gateway = nh->nh_gw;
1289 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1290 #ifdef CONFIG_IP_ROUTE_CLASSID
1291 rt->dst.tclassid = nh->nh_tclassid;
1294 cached = rt_bind_exception(rt, fnhe, daddr);
1295 else if (!(rt->dst.flags & DST_NOCACHE))
1296 cached = rt_cache_route(nh, rt);
1298 if (unlikely(!cached))
1299 rt_add_uncached_list(rt);
1301 #ifdef CONFIG_IP_ROUTE_CLASSID
1302 #ifdef CONFIG_IP_MULTIPLE_TABLES
1303 set_class_tag(rt, res->tclassid);
1305 set_class_tag(rt, itag);
1309 static struct rtable *rt_dst_alloc(struct net_device *dev,
1310 bool nopolicy, bool noxfrm, bool will_cache)
1312 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1313 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1314 (nopolicy ? DST_NOPOLICY : 0) |
1315 (noxfrm ? DST_NOXFRM : 0));
1318 /* called in rcu_read_lock() section */
/*
 * Build the input route for a received multicast packet.
 *
 * @our: NOTE(review): presumably non-zero when this host is a member of the
 *       destination group, selecting the local-delivery settings - confirm.
 *
 * After the source-address sanity checks a route is allocated on the
 * namespace's loopback device and initialised as an input multicast route
 * (RTCF_MULTICAST / RTN_MULTICAST, no gateway).  Its output hook is
 * ip_rt_bug.  The route is attached to @skb with skb_dst_set().
 */
1319 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1320 u8 tos, struct net_device *dev, int our)
1323 struct in_device *in_dev = __in_dev_get_rcu(dev);
1327 /* Primary sanity checks. */
1332 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1333 skb->protocol != htons(ETH_P_IP))
/* Loopback sources are examined unless route_localnet is enabled. */
1336 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1337 if (ipv4_is_loopback(saddr))
1340 if (ipv4_is_zeronet(saddr)) {
1341 if (!ipv4_is_local_multicast(daddr))
1344 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1349 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1350 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1354 #ifdef CONFIG_IP_ROUTE_CLASSID
1355 rth->dst.tclassid = itag;
1357 rth->dst.output = ip_rt_bug;
1359 rth->rt_genid = rt_genid(dev_net(dev));
1360 rth->rt_flags = RTCF_MULTICAST;
1361 rth->rt_type = RTN_MULTICAST;
1362 rth->rt_is_input= 1;
1365 rth->rt_gateway = 0;
1366 INIT_LIST_HEAD(&rth->rt_uncached);
1368 rth->dst.input= ip_local_deliver;
1369 rth->rt_flags |= RTCF_LOCAL;
1372 #ifdef CONFIG_IP_MROUTE
/* Non link-local groups go to ip_mr_input when multicast forwarding is on. */
1373 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1374 rth->dst.input = ip_mr_input;
1376 RT_CACHE_STAT_INC(in_slow_mc);
1378 skb_dst_set(skb, &rth->dst);
/*
 * Account for, and optionally log, a packet with a martian source address.
 *
 * The in_martian_src statistic is always incremented.  With
 * CONFIG_IP_ROUTE_VERBOSE, and when martian logging is enabled on @in_dev and
 * net_ratelimit() allows it, a warning with the addresses and device name is
 * printed, followed by a hex dump of the link-layer header if the device has
 * one and the skb's MAC header was set.
 */
1390 static void ip_handle_martian_source(struct net_device *dev,
1391 struct in_device *in_dev,
1392 struct sk_buff *skb,
1396 RT_CACHE_STAT_INC(in_martian_src);
1397 #ifdef CONFIG_IP_ROUTE_VERBOSE
1398 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1400 * RFC1812 recommendation, if source is martian,
1401 * the only hint is MAC header.
1403 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1404 &daddr, &saddr, dev->name);
1405 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1406 print_hex_dump(KERN_WARNING, "ll header: ",
1407 DUMP_PREFIX_OFFSET, 16, 1,
1408 skb_mac_header(skb),
1409 dev->hard_header_len, true);
/*
 * __mkroute_input - attach a forwarding route to an incoming packet.
 *
 * Resolves the in_device of the FIB result's output device, validates the
 * source address with fib_validate_source() (martian sources are reported
 * through ip_handle_martian_source()), and then either reuses the
 * nexthop's cached input route (nh_rth_input, when rt_cache_valid()
 * accepts it) or allocates a new rtable with dst.input = ip_forward and
 * dst.output = ip_output.  In both cases the dst is attached to @skb.
 *
 * @skb:    packet being routed
 * @res:    FIB lookup result describing the nexthop
 * @in_dev: in_device of the receiving interface
 * @daddr:  destination address
 * @saddr:  source address
 * @tos:    type of service passed to source validation
 */
1415 /* called in rcu_read_lock() section */
1416 static int __mkroute_input(struct sk_buff *skb,
1417 const struct fib_result *res,
1418 struct in_device *in_dev,
1419 __be32 daddr, __be32 saddr, u32 tos)
1423 struct in_device *out_dev;
1424 unsigned int flags = 0;
1428 /* get a working reference to the output device */
1429 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1430 if (out_dev == NULL) {
1431 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1436 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1437 in_dev->dev, in_dev, &itag);
1439 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
/*
 * Packet would leave through the interface it arrived on, validation
 * returned a non-zero value, and the medium is shared or the gateway is
 * on-link for saddr: mark the route with RTCF_DOREDIRECT.
 */
1445 if (out_dev == in_dev && err &&
1446 (IN_DEV_SHARED_MEDIA(out_dev) ||
1447 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1448 flags |= RTCF_DOREDIRECT;
1450 if (skb->protocol != htons(ETH_P_IP)) {
1451 /* Not IP (i.e. ARP). Do not create route, if it is
1452 * invalid for proxy arp. DNAT routes are always valid.
1454 * Proxy arp feature have been extended to allow, ARP
1455 * replies back to the same interface, to support
1456 * Private VLAN switch technologies. See arp.c.
1458 if (out_dev == in_dev &&
1459 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1468 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1469 if (rt_cache_valid(rth)) {
1470 skb_dst_set_noref(skb, &rth->dst);
1477 rth = rt_dst_alloc(out_dev->dev,
1478 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1479 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1485 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1486 rth->rt_flags = flags;
1487 rth->rt_type = res->type;
1488 rth->rt_is_input = 1;
1491 rth->rt_gateway = 0;
1492 INIT_LIST_HEAD(&rth->rt_uncached);
1494 rth->dst.input = ip_forward;
1495 rth->dst.output = ip_output;
1497 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1498 skb_dst_set(skb, &rth->dst);
/*
 * ip_mkroute_input - select a nexthop and build the input route.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, a result whose fib_info lists more than
 * one nexthop first goes through fib_select_multipath().  The route is
 * then created by __mkroute_input(), whose return value is passed back
 * unchanged.
 */
1505 static int ip_mkroute_input(struct sk_buff *skb,
1506 struct fib_result *res,
1507 const struct flowi4 *fl4,
1508 struct in_device *in_dev,
1509 __be32 daddr, __be32 saddr, u32 tos)
1511 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1512 if (res->fi && res->fi->fib_nhs > 1)
1513 fib_select_multipath(res);
1516 /* create a routing cache entry */
1517 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1521 * NOTE. We drop all packets that have local source
1522 * addresses, because every properly looped-back packet
1523 * must already have the correct destination attached by the output routine.
1525 * This approach solves two big problems:
1526 * 1. Non-simplex devices are handled properly.
1527 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1528 * called with rcu_read_lock()
/*
 * ip_route_input_slow - resolve the input route for @skb through the FIB.
 *
 * Address sanity checks come first.  A multicast or limited-broadcast
 * source is a martian source.  A limited-broadcast destination, or an
 * all-zero source/destination pair, is tested before the zeronet checks;
 * after that a zeronet source is a martian source and a zeronet
 * destination a martian destination.  Unless IN_DEV_ROUTE_LOCALNET is set
 * on the receiving device, loopback destinations and sources are martian
 * as well.
 *
 * The flow handed to fib_lookup() uses the incoming ifindex, the skb
 * mark, @tos and RT_SCOPE_UNIVERSE.  RTN_LOCAL results are
 * source-validated against the loopback ifindex; results that are to be
 * forwarded must be RTN_UNICAST and are handed to ip_mkroute_input().
 * The rtable allocated in this function has dst.input = ip_local_deliver
 * and dst.output = ip_rt_bug; for RTN_UNREACHABLE the input handler is
 * replaced by ip_error, dst.error is set to -err and RTCF_LOCAL is
 * cleared.
 *
 * @skb:   packet being routed
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type of service
 * @dev:   receiving network device
 */
1531 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1532 u8 tos, struct net_device *dev)
1534 struct fib_result res;
1535 struct in_device *in_dev = __in_dev_get_rcu(dev);
1537 unsigned int flags = 0;
1541 struct net *net = dev_net(dev);
1544 /* IP on this device is disabled. */
1549 /* Check for the most weird martians, which can be not detected
1553 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1554 goto martian_source;
1557 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1560 /* Accept zero addresses only to limited broadcast;
1561 * I even do not know to fix it or not. Waiting for complains :-)
1563 if (ipv4_is_zeronet(saddr))
1564 goto martian_source;
1566 if (ipv4_is_zeronet(daddr))
1567 goto martian_destination;
1569 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
1570 if (ipv4_is_loopback(daddr))
1571 goto martian_destination;
1573 if (ipv4_is_loopback(saddr))
1574 goto martian_source;
1578 * Now we are ready to route packet.
1581 fl4.flowi4_iif = dev->ifindex;
1582 fl4.flowi4_mark = skb->mark;
1583 fl4.flowi4_tos = tos;
1584 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1587 err = fib_lookup(net, &fl4, &res);
1591 RT_CACHE_STAT_INC(in_slow_tot);
1593 if (res.type == RTN_BROADCAST)
1596 if (res.type == RTN_LOCAL) {
1597 err = fib_validate_source(skb, saddr, daddr, tos,
1598 net->loopback_dev->ifindex,
1599 dev, in_dev, &itag);
1601 goto martian_source_keep_err;
1605 if (!IN_DEV_FORWARD(in_dev))
1607 if (res.type != RTN_UNICAST)
1608 goto martian_destination;
1610 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1614 if (skb->protocol != htons(ETH_P_IP))
1617 if (!ipv4_is_zeronet(saddr)) {
1618 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1621 goto martian_source_keep_err;
1623 flags |= RTCF_BROADCAST;
1624 res.type = RTN_BROADCAST;
1625 RT_CACHE_STAT_INC(in_brd);
1631 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1632 if (rt_cache_valid(rth)) {
1633 skb_dst_set_noref(skb, &rth->dst);
1641 rth = rt_dst_alloc(net->loopback_dev,
1642 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1646 rth->dst.input= ip_local_deliver;
1647 rth->dst.output= ip_rt_bug;
1648 #ifdef CONFIG_IP_ROUTE_CLASSID
1649 rth->dst.tclassid = itag;
1652 rth->rt_genid = rt_genid(net);
1653 rth->rt_flags = flags|RTCF_LOCAL;
1654 rth->rt_type = res.type;
1655 rth->rt_is_input = 1;
1658 rth->rt_gateway = 0;
1659 INIT_LIST_HEAD(&rth->rt_uncached);
1660 if (res.type == RTN_UNREACHABLE) {
1661 rth->dst.input= ip_error;
1662 rth->dst.error= -err;
1663 rth->rt_flags &= ~RTCF_LOCAL;
1666 rt_cache_route(&FIB_RES_NH(res), rth);
1667 skb_dst_set(skb, &rth->dst);
1672 RT_CACHE_STAT_INC(in_no_route);
1673 res.type = RTN_UNREACHABLE;
1679 * Do not cache martian addresses: they should be logged (RFC1812)
1681 martian_destination:
1682 RT_CACHE_STAT_INC(in_martian_dst);
1683 #ifdef CONFIG_IP_ROUTE_VERBOSE
1684 if (IN_DEV_LOG_MARTIANS(in_dev))
1685 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1686 &daddr, &saddr, dev->name);
1699 martian_source_keep_err:
1700 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
/*
 * ip_route_input_noref - exported input-route entry point.
 *
 * Multicast destinations are screened here: the branch consults
 * ip_check_mc_rcu() for the receiving device and, with CONFIG_IP_MROUTE,
 * also IN_DEV_MFORWARD for groups for which ipv4_is_local_multicast() is
 * false, before calling ip_route_input_mc().  Other packets are resolved
 * by ip_route_input_slow().
 *
 * @skb:   packet being routed
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type of service
 * @dev:   receiving network device
 */
1704 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1705 u8 tos, struct net_device *dev)
1711 /* Multicast recognition logic is moved from route cache to here.
1712 The problem was that too many Ethernet cards have broken/missing
1713 hardware multicast filters :-( As result the host on multicasting
1714 network acquires a lot of useless route cache entries, sort of
1715 SDR messages from all the world. Now we try to get rid of them.
1716 Really, provided software IP multicast filter is organized
1717 reasonably (at least, hashed), it does not result in a slowdown
1718 comparing with route cache reject entries.
1719 Note, that multicast routers are not affected, because
1720 route cache entry is created eventually.
1722 if (ipv4_is_multicast(daddr)) {
1723 struct in_device *in_dev = __in_dev_get_rcu(dev);
1726 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1727 ip_hdr(skb)->protocol);
1729 #ifdef CONFIG_IP_MROUTE
1731 (!ipv4_is_local_multicast(daddr) &&
1732 IN_DEV_MFORWARD(in_dev))
1735 int res = ip_route_input_mc(skb, daddr, saddr,
1744 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1748 EXPORT_SYMBOL(ip_route_input_noref);
/*
 * __mkroute_output - build (or reuse) the output route for a resolved flow.
 *
 * Returns ERR_PTR(-EINVAL) for a loopback source address on a
 * non-loopback device (unless IN_DEV_ROUTE_LOCALNET is set) and for a
 * zeronet destination.  Limited-broadcast and multicast destinations
 * override the route type taken from @res.  A loopback output device sets
 * RTCF_LOCAL; broadcast and multicast routes also start with RTCF_LOCAL,
 * which is dropped again for multicast when ip_check_mc_rcu() returns
 * zero.
 *
 * A cached rtable is looked up through a nexthop exception
 * (find_exception()) or the per-CPU nh_pcpu_rth_output slot and reused
 * when rt_cache_valid() accepts it.  Otherwise a new rtable is allocated
 * (ERR_PTR(-ENOBUFS) on failure) with dst.output = ip_output; local
 * broadcast/multicast routes on non-loopback devices use ip_mc_output
 * instead, and with CONFIG_IP_MROUTE forwarded multicast uses
 * ip_mr_input/ip_mc_output.
 *
 * @res:      FIB lookup result
 * @fl4:      flow being routed
 * @orig_oif: output interface originally requested, stored in rt_iif
 * @dev_out:  device the route leaves through
 */
1750 /* called with rcu_read_lock() */
1751 static struct rtable *__mkroute_output(const struct fib_result *res,
1752 const struct flowi4 *fl4, int orig_oif,
1753 struct net_device *dev_out,
1756 struct fib_info *fi = res->fi;
1757 struct fib_nh_exception *fnhe;
1758 struct in_device *in_dev;
1759 u16 type = res->type;
1762 in_dev = __in_dev_get_rcu(dev_out);
1764 return ERR_PTR(-EINVAL);
1766 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1767 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1768 return ERR_PTR(-EINVAL);
1770 if (ipv4_is_lbcast(fl4->daddr))
1771 type = RTN_BROADCAST;
1772 else if (ipv4_is_multicast(fl4->daddr))
1773 type = RTN_MULTICAST;
1774 else if (ipv4_is_zeronet(fl4->daddr))
1775 return ERR_PTR(-EINVAL);
1777 if (dev_out->flags & IFF_LOOPBACK)
1778 flags |= RTCF_LOCAL;
1780 if (type == RTN_BROADCAST) {
1781 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1783 } else if (type == RTN_MULTICAST) {
1784 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1785 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1787 flags &= ~RTCF_LOCAL;
1788 /* If multicast route do not exist use
1789 * default one, but do not gateway in this case.
1792 if (fi && res->prefixlen < 4)
1798 struct rtable __rcu **prth;
1800 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1802 prth = &fnhe->fnhe_rth;
1804 prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1805 rth = rcu_dereference(*prth);
1806 if (rt_cache_valid(rth)) {
1807 dst_hold(&rth->dst);
1811 rth = rt_dst_alloc(dev_out,
1812 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1813 IN_DEV_CONF_GET(in_dev, NOXFRM),
1816 return ERR_PTR(-ENOBUFS);
1818 rth->dst.output = ip_output;
1820 rth->rt_genid = rt_genid(dev_net(dev_out));
1821 rth->rt_flags = flags;
1822 rth->rt_type = type;
1823 rth->rt_is_input = 0;
1824 rth->rt_iif = orig_oif ? : 0;
1826 rth->rt_gateway = 0;
1827 INIT_LIST_HEAD(&rth->rt_uncached);
1829 RT_CACHE_STAT_INC(out_slow_tot);
1831 if (flags & RTCF_LOCAL)
1832 rth->dst.input = ip_local_deliver;
1833 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1834 if (flags & RTCF_LOCAL &&
1835 !(dev_out->flags & IFF_LOOPBACK)) {
1836 rth->dst.output = ip_mc_output;
1837 RT_CACHE_STAT_INC(out_slow_mc);
1839 #ifdef CONFIG_IP_MROUTE
1840 if (type == RTN_MULTICAST) {
1841 if (IN_DEV_MFORWARD(in_dev) &&
1842 !ipv4_is_local_multicast(fl4->daddr)) {
1843 rth->dst.input = ip_mr_input;
1844 rth->dst.output = ip_mc_output;
1850 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1856 * Major route resolver routine.
/*
 * __ip_route_output_key - resolve an output route for the flow @fl4.
 *
 * The flow is normalised first: flowi4_iif becomes the loopback ifindex,
 * the TOS is masked with IPTOS_RT_MASK, and the scope is RT_SCOPE_LINK
 * when RTO_ONLINK is set in the TOS, RT_SCOPE_UNIVERSE otherwise.  The
 * requested output interface is remembered in orig_oif.
 *
 * Multicast, limited-broadcast and zeronet source addresses are checked
 * with -EINVAL prepared as the error.  For multicast or limited-broadcast
 * destinations without an output interface, the device owning the source
 * address (__ip_dev_find()) becomes the output interface.  An explicitly
 * requested interface must exist (-ENODEV) and be up with an in_device
 * (-ENETUNREACH).
 *
 * If fib_lookup() fails while an output interface is set, the destination
 * is assumed to be on-link and the result type is forced to RTN_UNICAST;
 * -ENETUNREACH is the other error prepared on that path.  RTN_LOCAL
 * results are redirected to the loopback device with RTCF_LOCAL.  Other
 * results may pass through fib_select_multipath() (with
 * CONFIG_IP_ROUTE_MULTIPATH) or fib_select_default() before
 * __mkroute_output() builds the rtable.
 *
 * @net: network namespace to route in
 * @fl4: flow key; several of its fields are updated during resolution
 */
1859 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1861 struct net_device *dev_out = NULL;
1862 __u8 tos = RT_FL_TOS(fl4);
1863 unsigned int flags = 0;
1864 struct fib_result res;
1872 orig_oif = fl4->flowi4_oif;
1874 fl4->flowi4_iif = net->loopback_dev->ifindex;
1875 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1876 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1877 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1881 rth = ERR_PTR(-EINVAL);
1882 if (ipv4_is_multicast(fl4->saddr) ||
1883 ipv4_is_lbcast(fl4->saddr) ||
1884 ipv4_is_zeronet(fl4->saddr))
1887 /* I removed check for oif == dev_out->oif here.
1888 It was wrong for two reasons:
1889 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1890 is assigned to multiple interfaces.
1891 2. Moreover, we are allowed to send packets with saddr
1892 of another iface. --ANK
1895 if (fl4->flowi4_oif == 0 &&
1896 (ipv4_is_multicast(fl4->daddr) ||
1897 ipv4_is_lbcast(fl4->daddr))) {
1898 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1899 dev_out = __ip_dev_find(net, fl4->saddr, false);
1900 if (dev_out == NULL)
1903 /* Special hack: user can direct multicasts
1904 and limited broadcast via necessary interface
1905 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1906 This hack is not just for fun, it allows
1907 vic,vat and friends to work.
1908 They bind socket to loopback, set ttl to zero
1909 and expect that it will work.
1910 From the viewpoint of routing cache they are broken,
1911 because we are not allowed to build multicast path
1912 with loopback source addr (look, routing cache
1913 cannot know, that ttl is zero, so that packet
1914 will not leave this host and route is valid).
1915 Luckily, this hack is good workaround.
1918 fl4->flowi4_oif = dev_out->ifindex;
1922 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1923 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1924 if (!__ip_dev_find(net, fl4->saddr, false))
1930 if (fl4->flowi4_oif) {
1931 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1932 rth = ERR_PTR(-ENODEV);
1933 if (dev_out == NULL)
1936 /* RACE: Check return value of inet_select_addr instead. */
1937 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1938 rth = ERR_PTR(-ENETUNREACH);
1941 if (ipv4_is_local_multicast(fl4->daddr) ||
1942 ipv4_is_lbcast(fl4->daddr)) {
1944 fl4->saddr = inet_select_addr(dev_out, 0,
1949 if (ipv4_is_multicast(fl4->daddr))
1950 fl4->saddr = inet_select_addr(dev_out, 0,
1952 else if (!fl4->daddr)
1953 fl4->saddr = inet_select_addr(dev_out, 0,
1959 fl4->daddr = fl4->saddr;
1961 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1962 dev_out = net->loopback_dev;
1963 fl4->flowi4_oif = net->loopback_dev->ifindex;
1964 res.type = RTN_LOCAL;
1965 flags |= RTCF_LOCAL;
1969 if (fib_lookup(net, fl4, &res)) {
1972 if (fl4->flowi4_oif) {
1973 /* Apparently, routing tables are wrong. Assume,
1974 that the destination is on link.
1977 Because we are allowed to send to iface
1978 even if it has NO routes and NO assigned
1979 addresses. When oif is specified, routing
1980 tables are looked up with only one purpose:
1981 to catch if destination is gatewayed, rather than
1982 direct. Moreover, if MSG_DONTROUTE is set,
1983 we send packet, ignoring both routing tables
1984 and ifaddr state. --ANK
1987 We could make it even if oif is unknown,
1988 likely IPv6, but we do not.
1991 if (fl4->saddr == 0)
1992 fl4->saddr = inet_select_addr(dev_out, 0,
1994 res.type = RTN_UNICAST;
1997 rth = ERR_PTR(-ENETUNREACH);
2001 if (res.type == RTN_LOCAL) {
2003 if (res.fi->fib_prefsrc)
2004 fl4->saddr = res.fi->fib_prefsrc;
2006 fl4->saddr = fl4->daddr;
2008 dev_out = net->loopback_dev;
2009 fl4->flowi4_oif = dev_out->ifindex;
2010 flags |= RTCF_LOCAL;
2014 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2015 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2016 fib_select_multipath(&res);
2019 if (!res.prefixlen &&
2020 res.table->tb_num_default > 1 &&
2021 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2022 fib_select_default(&res);
2025 fl4->saddr = FIB_RES_PREFSRC(net, res);
2027 dev_out = FIB_RES_DEV(res);
2028 fl4->flowi4_oif = dev_out->ifindex;
2032 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2038 EXPORT_SYMBOL_GPL(__ip_route_output_key);
/* Installed as the .check callback of ipv4_dst_blackhole_ops. */
2040 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
/*
 * .mtu callback of ipv4_dst_blackhole_ops: return the raw RTAX_MTU metric
 * of @dst when it is non-zero, otherwise the MTU of the dst's device.
 */
2045 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2047 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2049 return mtu ? : dst->dev->mtu;
/* Installed as the .update_pmtu callback of ipv4_dst_blackhole_ops. */
2052 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2053 struct sk_buff *skb, u32 mtu)
/* Installed as the .redirect callback of ipv4_dst_blackhole_ops. */
2057 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2058 struct sk_buff *skb)
/* Installed as the .cow_metrics callback of ipv4_dst_blackhole_ops. */
2062 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
/*
 * dst_ops used for the entries allocated by ipv4_blackhole_route().  The
 * blackhole-specific callbacks are combined with the regular
 * ipv4_default_advmss() and ipv4_neigh_lookup() helpers.
 */
2068 static struct dst_ops ipv4_dst_blackhole_ops = {
2070 .protocol = cpu_to_be16(ETH_P_IP),
2071 .check = ipv4_blackhole_dst_check,
2072 .mtu = ipv4_blackhole_mtu,
2073 .default_advmss = ipv4_default_advmss,
2074 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2075 .redirect = ipv4_rt_blackhole_redirect,
2076 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2077 .neigh_lookup = ipv4_neigh_lookup,
/*
 * ipv4_blackhole_route - create a packet-discarding copy of @dst_orig.
 *
 * Allocates a dst from ipv4_dst_blackhole_ops whose input and output
 * handlers are both dst_discard, copies the device and the rt_* fields
 * listed below from the original rtable, stamps the copy with the current
 * rt_genid of @net and releases @dst_orig.
 *
 * Returns the new dst, or ERR_PTR(-ENOMEM) when no rtable was allocated.
 */
2080 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2082 struct rtable *ort = (struct rtable *) dst_orig;
2085 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2087 struct dst_entry *new = &rt->dst;
2090 new->input = dst_discard;
2091 new->output = dst_discard;
2093 new->dev = ort->dst.dev;
2097 rt->rt_is_input = ort->rt_is_input;
2098 rt->rt_iif = ort->rt_iif;
2099 rt->rt_pmtu = ort->rt_pmtu;
2101 rt->rt_genid = rt_genid(net);
2102 rt->rt_flags = ort->rt_flags;
2103 rt->rt_type = ort->rt_type;
2104 rt->rt_gateway = ort->rt_gateway;
2106 INIT_LIST_HEAD(&rt->rt_uncached);
2111 dst_release(dst_orig);
2113 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
/*
 * ip_route_output_flow - output route lookup followed by xfrm_lookup().
 *
 * Resolves @flp4 with __ip_route_output_key().  When the flow carries a
 * protocol (flowi4_proto is non-zero) the resulting dst is passed through
 * xfrm_lookup() and that function's result replaces the route.
 */
2116 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2119 struct rtable *rt = __ip_route_output_key(net, flp4);
2124 if (flp4->flowi4_proto)
2125 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2126 flowi4_to_flowi(flp4),
2131 EXPORT_SYMBOL_GPL(ip_route_output_flow);
/*
 * rt_fill_info - encode the route attached to @skb as a netlink message.
 *
 * Appends an rtmsg header (AF_INET, destination length 32, table
 * RT_TABLE_MAIN, scope RT_SCOPE_UNIVERSE, protocol RTPROT_UNSPEC) followed
 * by RTA_* attributes describing skb_rtable(@skb): table, destination,
 * source, output interface, classid (CONFIG_IP_ROUTE_CLASSID), preferred
 * source, gateway, metrics, mark, input interface and cache info.
 *
 * Every failed nla_put*() jumps to nla_put_failure, where the partially
 * built message is cancelled with nlmsg_cancel().  On success the result
 * of nlmsg_end() is returned.
 *
 * @net:    network namespace of the request
 * @dst:    destination address reported as RTA_DST
 * @src:    source address reported as RTA_SRC
 * @fl4:    flow used for the lookup (supplies TOS, saddr and mark)
 * @skb:    message buffer; also carries the rtable being described
 * @pid:    netlink pid for the message header
 * @seq:    netlink sequence number for the message header
 * @event:  netlink message type
 * @nowait: not referenced in the lines shown here
 * @flags:  netlink message flags
 */
2133 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2134 struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2135 u32 seq, int event, int nowait, unsigned int flags)
2137 struct rtable *rt = skb_rtable(skb);
2139 struct nlmsghdr *nlh;
2140 unsigned long expires = 0;
2142 u32 metrics[RTAX_MAX];
2144 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2148 r = nlmsg_data(nlh);
2149 r->rtm_family = AF_INET;
2150 r->rtm_dst_len = 32;
2152 r->rtm_tos = fl4->flowi4_tos;
2153 r->rtm_table = RT_TABLE_MAIN;
2154 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2155 goto nla_put_failure;
2156 r->rtm_type = rt->rt_type;
2157 r->rtm_scope = RT_SCOPE_UNIVERSE;
2158 r->rtm_protocol = RTPROT_UNSPEC;
/* Mask off the low 16 bits of rt_flags and mark the entry as cloned. */
2159 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2160 if (rt->rt_flags & RTCF_NOTIFY)
2161 r->rtm_flags |= RTM_F_NOTIFY;
2163 if (nla_put_be32(skb, RTA_DST, dst))
2164 goto nla_put_failure;
2166 r->rtm_src_len = 32;
2167 if (nla_put_be32(skb, RTA_SRC, src))
2168 goto nla_put_failure;
2171 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2172 goto nla_put_failure;
2173 #ifdef CONFIG_IP_ROUTE_CLASSID
2174 if (rt->dst.tclassid &&
2175 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2176 goto nla_put_failure;
/* Output routes report the flow's saddr as RTA_PREFSRC when it differs from @src. */
2178 if (!rt_is_input_route(rt) &&
2179 fl4->saddr != src) {
2180 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2181 goto nla_put_failure;
2183 if (rt->rt_gateway &&
2184 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2185 goto nla_put_failure;
/*
 * Work on a local copy of the metrics so that the rt_pmtu override below
 * does not modify the dst's own metrics array.
 */
2187 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2189 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2190 if (rtnetlink_put_metrics(skb, metrics) < 0)
2191 goto nla_put_failure;
2193 if (fl4->flowi4_mark &&
2194 nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2195 goto nla_put_failure;
2197 error = rt->dst.error;
2198 expires = rt->dst.expires;
2200 if (time_before(jiffies, expires))
2206 if (rt_is_input_route(rt)) {
2207 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2208 goto nla_put_failure;
2211 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2212 goto nla_put_failure;
2214 return nlmsg_end(skb, nlh);
2217 nlmsg_cancel(skb, nlh);
/*
 * inet_rtm_getroute - RTM_GETROUTE handler, registered by ip_rt_init().
 *
 * Parses the request attributes against rtm_ipv4_policy, allocates a
 * reply skb with a dummy IP header, and resolves the route either with
 * ip_route_input() on the device found for RTA_IIF or with
 * ip_route_output_key() on a flow built from the request (TOS, RTA_OIF,
 * RTA_MARK).  The result is encoded by rt_fill_info() as an RTM_NEWROUTE
 * message and sent back with rtnl_unicast() to the requesting pid.
 *
 * @in_skb: request message buffer
 * @nlh:    netlink header of the request
 * @arg:    not referenced in the lines shown here
 */
2221 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2223 struct net *net = sock_net(in_skb->sk);
2225 struct nlattr *tb[RTA_MAX+1];
2226 struct rtable *rt = NULL;
2233 struct sk_buff *skb;
2235 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2239 rtm = nlmsg_data(nlh);
2241 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2247 /* Reserve room for dummy headers, this skb can pass
2248 through good chunk of routing engine.
2250 skb_reset_mac_header(skb);
2251 skb_reset_network_header(skb);
2253 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2254 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2255 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
/* Attributes absent from the request default to 0. */
2257 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2258 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2259 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2260 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2262 memset(&fl4, 0, sizeof(fl4));
2265 fl4.flowi4_tos = rtm->rtm_tos;
2266 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2267 fl4.flowi4_mark = mark;
2270 struct net_device *dev;
2272 dev = __dev_get_by_index(net, iif);
2278 skb->protocol = htons(ETH_P_IP);
2282 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
/* A route that carries a dst error is reported as a failure of the lookup. */
2285 rt = skb_rtable(skb);
2286 if (err == 0 && rt->dst.error)
2287 err = -rt->dst.error;
2289 rt = ip_route_output_key(net, &fl4);
2299 skb_dst_set(skb, &rt->dst);
2300 if (rtm->rtm_flags & RTM_F_NOTIFY)
2301 rt->rt_flags |= RTCF_NOTIFY;
2303 err = rt_fill_info(net, dst, src, &fl4, skb,
2304 NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2305 RTM_NEWROUTE, 0, 0);
2309 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2318 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
/* Flush the route cache of the network namespace that owns in_dev->dev. */
2323 void ip_rt_multicast_event(struct in_device *in_dev)
2325 rt_cache_flush(dev_net(in_dev->dev));
2328 #ifdef CONFIG_SYSCTL
/*
 * proc_handler of the "flush" entry in ipv4_route_flush_table: flushes the
 * route cache of the namespace stored in the entry's extra1 field (set by
 * sysctl_route_net_init()).
 */
2329 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2330 void __user *buffer,
2331 size_t *lenp, loff_t *ppos)
2334 rt_cache_flush((struct net *)__ctl->extra1);
/*
 * Route tunables registered under "net/ipv4/route" for init_net by
 * ip_static_sysctl_init().  Every entry is an int.  gc_min_interval and
 * gc_min_interval_ms expose the same variable (ip_rt_gc_min_interval)
 * through proc_dointvec_jiffies and proc_dointvec_ms_jiffies respectively.
 */
2341 static ctl_table ipv4_route_table[] = {
2343 .procname = "gc_thresh",
2344 .data = &ipv4_dst_ops.gc_thresh,
2345 .maxlen = sizeof(int),
2347 .proc_handler = proc_dointvec,
2350 .procname = "max_size",
2351 .data = &ip_rt_max_size,
2352 .maxlen = sizeof(int),
2354 .proc_handler = proc_dointvec,
2357 /* Deprecated. Use gc_min_interval_ms */
2359 .procname = "gc_min_interval",
2360 .data = &ip_rt_gc_min_interval,
2361 .maxlen = sizeof(int),
2363 .proc_handler = proc_dointvec_jiffies,
2366 .procname = "gc_min_interval_ms",
2367 .data = &ip_rt_gc_min_interval,
2368 .maxlen = sizeof(int),
2370 .proc_handler = proc_dointvec_ms_jiffies,
2373 .procname = "gc_timeout",
2374 .data = &ip_rt_gc_timeout,
2375 .maxlen = sizeof(int),
2377 .proc_handler = proc_dointvec_jiffies,
2380 .procname = "gc_interval",
2381 .data = &ip_rt_gc_interval,
2382 .maxlen = sizeof(int),
2384 .proc_handler = proc_dointvec_jiffies,
2387 .procname = "redirect_load",
2388 .data = &ip_rt_redirect_load,
2389 .maxlen = sizeof(int),
2391 .proc_handler = proc_dointvec,
2394 .procname = "redirect_number",
2395 .data = &ip_rt_redirect_number,
2396 .maxlen = sizeof(int),
2398 .proc_handler = proc_dointvec,
2401 .procname = "redirect_silence",
2402 .data = &ip_rt_redirect_silence,
2403 .maxlen = sizeof(int),
2405 .proc_handler = proc_dointvec,
2408 .procname = "error_cost",
2409 .data = &ip_rt_error_cost,
2410 .maxlen = sizeof(int),
2412 .proc_handler = proc_dointvec,
2415 .procname = "error_burst",
2416 .data = &ip_rt_error_burst,
2417 .maxlen = sizeof(int),
2419 .proc_handler = proc_dointvec,
2422 .procname = "gc_elasticity",
2423 .data = &ip_rt_gc_elasticity,
2424 .maxlen = sizeof(int),
2426 .proc_handler = proc_dointvec,
2429 .procname = "mtu_expires",
2430 .data = &ip_rt_mtu_expires,
2431 .maxlen = sizeof(int),
2433 .proc_handler = proc_dointvec_jiffies,
2436 .procname = "min_pmtu",
2437 .data = &ip_rt_min_pmtu,
2438 .maxlen = sizeof(int),
2440 .proc_handler = proc_dointvec,
2443 .procname = "min_adv_mss",
2444 .data = &ip_rt_min_advmss,
2445 .maxlen = sizeof(int),
2447 .proc_handler = proc_dointvec,
/*
 * Table holding the "flush" entry, served by ipv4_sysctl_rtcache_flush().
 * sysctl_route_net_init() registers it per namespace, using a kmemdup()'d
 * copy for namespaces other than init_net.
 */
2452 static struct ctl_table ipv4_route_flush_table[] = {
2454 .procname = "flush",
2455 .maxlen = sizeof(int),
2457 .proc_handler = ipv4_sysctl_rtcache_flush,
/*
 * Per-namespace init: register the flush table for @net under
 * "net/ipv4/route".  init_net uses the static ipv4_route_flush_table;
 * every other namespace gets its own kmemdup()'d copy.  The namespace
 * pointer is stored in extra1 of the first entry so the flush handler can
 * find it.
 */
2462 static __net_init int sysctl_route_net_init(struct net *net)
2464 struct ctl_table *tbl;
2466 tbl = ipv4_route_flush_table;
2467 if (!net_eq(net, &init_net)) {
2468 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2472 tbl[0].extra1 = net;
2474 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2475 if (net->ipv4.route_hdr == NULL)
2480 if (tbl != ipv4_route_flush_table)
/*
 * Per-namespace exit: unregister @net's route sysctl header.  The table
 * pointer is read from the header before it is unregistered; BUG_ON()
 * asserts that it is not the static ipv4_route_flush_table.
 */
2486 static __net_exit void sysctl_route_net_exit(struct net *net)
2488 struct ctl_table *tbl;
2490 tbl = net->ipv4.route_hdr->ctl_table_arg;
2491 unregister_net_sysctl_table(net->ipv4.route_hdr);
2492 BUG_ON(tbl == ipv4_route_flush_table);
/* Pernet init/exit hooks for the route "flush" sysctl; registered by ip_rt_init(). */
2496 static __net_initdata struct pernet_operations sysctl_route_ops = {
2497 .init = sysctl_route_net_init,
2498 .exit = sysctl_route_net_exit,
/*
 * Per-namespace init: reset @net's rt_genid to 0 and seed
 * net->ipv4.dev_addr_genid with random bytes.
 */
2502 static __net_init int rt_genid_init(struct net *net)
2504 atomic_set(&net->rt_genid, 0);
2505 get_random_bytes(&net->ipv4.dev_addr_genid,
2506 sizeof(net->ipv4.dev_addr_genid));
/* Pernet operations with rt_genid_init() as the init hook; registered by ip_rt_init(). */
2510 static __net_initdata struct pernet_operations rt_genid_ops = {
2511 .init = rt_genid_init,
/*
 * Per-namespace init: allocate an inet_peer_base with kmalloc(),
 * initialise it and publish it as net->ipv4.peers.
 */
2514 static int __net_init ipv4_inetpeer_init(struct net *net)
2516 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2520 inet_peer_base_init(bp);
2521 net->ipv4.peers = bp;
/*
 * Per-namespace exit: detach the inet_peer_base from @net (the pointer is
 * cleared first) and invalidate its tree.
 */
2525 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2527 struct inet_peer_base *bp = net->ipv4.peers;
2529 net->ipv4.peers = NULL;
2530 inetpeer_invalidate_tree(bp);
/* Pernet init/exit hooks for the per-namespace inet_peer_base; registered by ip_rt_init(). */
2534 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2535 .init = ipv4_inetpeer_init,
2536 .exit = ipv4_inetpeer_exit,
/*
 * Per-CPU route accounting area, present only with CONFIG_IP_ROUTE_CLASSID.
 * ip_rt_init() allocates it with room for 256 struct ip_rt_acct entries.
 */
2539 #ifdef CONFIG_IP_ROUTE_CLASSID
2540 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2541 #endif /* CONFIG_IP_ROUTE_CLASSID */
/*
 * ip_rt_init - boot-time initialisation of the IPv4 routing layer.
 *
 * In order: allocates the per-CPU ip_rt_acct area (with
 * CONFIG_IP_ROUTE_CLASSID), creates the "ip_dst_cache" slab shared by
 * ipv4_dst_ops and ipv4_dst_blackhole_ops, initialises the dst entry
 * counters of both ops, sets gc_thresh to ~0 and ip_rt_max_size to
 * INT_MAX, creates the route proc files, initialises xfrm4, registers the
 * RTM_GETROUTE handler and finally the pernet subsystems (sysctl, genid,
 * inetpeer).  The allocation and counter failures shown here panic; a
 * proc-file failure is only logged.
 */
2543 int __init ip_rt_init(void)
2547 #ifdef CONFIG_IP_ROUTE_CLASSID
2548 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2550 panic("IP: failed to allocate ip_rt_acct\n");
2553 ipv4_dst_ops.kmem_cachep =
2554 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2555 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2557 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2559 if (dst_entries_init(&ipv4_dst_ops) < 0)
2560 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2562 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2563 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2565 ipv4_dst_ops.gc_thresh = ~0;
2566 ip_rt_max_size = INT_MAX;
2571 if (ip_rt_proc_init())
2572 pr_err("Unable to create route proc files\n");
2575 xfrm4_init(ip_rt_max_size);
2577 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2579 #ifdef CONFIG_SYSCTL
2580 register_pernet_subsys(&sysctl_route_ops);
2582 register_pernet_subsys(&rt_genid_ops);
2583 register_pernet_subsys(&ipv4_inetpeer_ops);
2587 #ifdef CONFIG_SYSCTL
2589 * We really need to sanitize the damn ipv4 init order, then all
2590 * this nonsense will go away.
2592 void __init ip_static_sysctl_init(void)
2594 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);