net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
 117 #define RT_FL_TOS(oldflp4) \
 118         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 119
 120 #define RT_GC_TIMEOUT (300*HZ)
 121
 122 static int ip_rt_max_size;
 123 static int ip_rt_redirect_number __read_mostly  = 9;
 124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost __read_mostly       = HZ;
 127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131
 132 /*
 133  *      Interface to generic destination cache.
 134  */
 135
 136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 137 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 138 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 140 static void              ipv4_link_failure(struct sk_buff *skb);
 141 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 142                                            struct sk_buff *skb, u32 mtu);
 143 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 144                                         struct sk_buff *skb);
 145 static void             ipv4_dst_destroy(struct dst_entry *dst);
 146
 147 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 148 {
 149         WARN_ON(1);
 150         return NULL;
 151 }
 152
 153 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 154                                            struct sk_buff *skb,
 155                                            const void *daddr);
 156
 157 static struct dst_ops ipv4_dst_ops = {
 158         .family =               AF_INET,
 159         .check =                ipv4_dst_check,
 160         .default_advmss =       ipv4_default_advmss,
 161         .mtu =                  ipv4_mtu,
 162         .cow_metrics =          ipv4_cow_metrics,
 163         .destroy =              ipv4_dst_destroy,
 164         .negative_advice =      ipv4_negative_advice,
 165         .link_failure =         ipv4_link_failure,
 166         .update_pmtu =          ip_rt_update_pmtu,
 167         .redirect =             ip_do_redirect,
 168         .local_out =            __ip_local_out,
 169         .neigh_lookup =         ipv4_neigh_lookup,
 170 };
 171
 172 #define ECN_OR_COST(class)      TC_PRIO_##class
 173
 174 const __u8 ip_tos2prio[16] = {
 175         TC_PRIO_BESTEFFORT,
 176         ECN_OR_COST(BESTEFFORT),
 177         TC_PRIO_BESTEFFORT,
 178         ECN_OR_COST(BESTEFFORT),
 179         TC_PRIO_BULK,
 180         ECN_OR_COST(BULK),
 181         TC_PRIO_BULK,
 182         ECN_OR_COST(BULK),
 183         TC_PRIO_INTERACTIVE,
 184         ECN_OR_COST(INTERACTIVE),
 185         TC_PRIO_INTERACTIVE,
 186         ECN_OR_COST(INTERACTIVE),
 187         TC_PRIO_INTERACTIVE_BULK,
 188         ECN_OR_COST(INTERACTIVE_BULK),
 189         TC_PRIO_INTERACTIVE_BULK,
 190         ECN_OR_COST(INTERACTIVE_BULK)
 191 };
 192 EXPORT_SYMBOL(ip_tos2prio);
 193
 194 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 195 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 196
 197 #ifdef CONFIG_PROC_FS
 198 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 199 {
 200         if (*pos)
 201                 return NULL;
 202         return SEQ_START_TOKEN;
 203 }
 204
 205 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 206 {
 207         ++*pos;
 208         return NULL;
 209 }
 210
 211 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 212 {
 213 }
 214
 215 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 216 {
 217         if (v == SEQ_START_TOKEN)
 218                 seq_printf(seq, "%-127s\n",
 219                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 220                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 221                            "HHUptod\tSpecDst");
 222         return 0;
 223 }
 224
 225 static const struct seq_operations rt_cache_seq_ops = {
 226         .start  = rt_cache_seq_start,
 227         .next   = rt_cache_seq_next,
 228         .stop   = rt_cache_seq_stop,
 229         .show   = rt_cache_seq_show,
 230 };
 231
 232 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 233 {
 234         return seq_open(file, &rt_cache_seq_ops);
 235 }
 236
 237 static const struct file_operations rt_cache_seq_fops = {
 238         .owner   = THIS_MODULE,
 239         .open    = rt_cache_seq_open,
 240         .read    = seq_read,
 241         .llseek  = seq_lseek,
 242         .release = seq_release,
 243 };
 244
 245
 246 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 247 {
 248         int cpu;
 249
 250         if (*pos == 0)
 251                 return SEQ_START_TOKEN;
 252
 253         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 254                 if (!cpu_possible(cpu))
 255                         continue;
 256                 *pos = cpu+1;
 257                 return &per_cpu(rt_cache_stat, cpu);
 258         }
 259         return NULL;
 260 }
 261
 262 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 263 {
 264         int cpu;
 265
 266         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 267                 if (!cpu_possible(cpu))
 268                         continue;
 269                 *pos = cpu+1;
 270                 return &per_cpu(rt_cache_stat, cpu);
 271         }
 272         return NULL;
 273
 274 }
 275
 276 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 277 {
 278
 279 }
 280
 281 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 282 {
 283         struct rt_cache_stat *st = v;
 284
 285         if (v == SEQ_START_TOKEN) {
 286                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 287                 return 0;
 288         }
 289
 290         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 291                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 292                    dst_entries_get_slow(&ipv4_dst_ops),
 293                    0, /* st->in_hit */
 294                    st->in_slow_tot,
 295                    st->in_slow_mc,
 296                    st->in_no_route,
 297                    st->in_brd,
 298                    st->in_martian_dst,
 299                    st->in_martian_src,
 300
 301                    0, /* st->out_hit */
 302                    st->out_slow_tot,
 303                    st->out_slow_mc,
 304
 305                    0, /* st->gc_total */
 306                    0, /* st->gc_ignored */
 307                    0, /* st->gc_goal_miss */
 308                    0, /* st->gc_dst_overflow */
 309                    0, /* st->in_hlist_search */
 310                    0  /* st->out_hlist_search */
 311                 );
 312         return 0;
 313 }
 314
 315 static const struct seq_operations rt_cpu_seq_ops = {
 316         .start  = rt_cpu_seq_start,
 317         .next   = rt_cpu_seq_next,
 318         .stop   = rt_cpu_seq_stop,
 319         .show   = rt_cpu_seq_show,
 320 };
 321
 322
 323 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 324 {
 325         return seq_open(file, &rt_cpu_seq_ops);
 326 }
 327
 328 static const struct file_operations rt_cpu_seq_fops = {
 329         .owner   = THIS_MODULE,
 330         .open    = rt_cpu_seq_open,
 331         .read    = seq_read,
 332         .llseek  = seq_lseek,
 333         .release = seq_release,
 334 };
 335
 336 #ifdef CONFIG_IP_ROUTE_CLASSID
 337 static int rt_acct_proc_show(struct seq_file *m, void *v)
 338 {
 339         struct ip_rt_acct *dst, *src;
 340         unsigned int i, j;
 341
 342         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 343         if (!dst)
 344                 return -ENOMEM;
 345
 346         for_each_possible_cpu(i) {
 347                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 348                 for (j = 0; j < 256; j++) {
 349                         dst[j].o_bytes   += src[j].o_bytes;
 350                         dst[j].o_packets += src[j].o_packets;
 351                         dst[j].i_bytes   += src[j].i_bytes;
 352                         dst[j].i_packets += src[j].i_packets;
 353                 }
 354         }
 355
 356         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 357         kfree(dst);
 358         return 0;
 359 }
 360
 361 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 362 {
 363         return single_open(file, rt_acct_proc_show, NULL);
 364 }
 365
 366 static const struct file_operations rt_acct_proc_fops = {
 367         .owner          = THIS_MODULE,
 368         .open           = rt_acct_proc_open,
 369         .read           = seq_read,
 370         .llseek         = seq_lseek,
 371         .release        = single_release,
 372 };
 373 #endif
 374
 375 static int __net_init ip_rt_do_proc_init(struct net *net)
 376 {
 377         struct proc_dir_entry *pde;
 378
 379         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 380                           &rt_cache_seq_fops);
 381         if (!pde)
 382                 goto err1;
 383
 384         pde = proc_create("rt_cache", S_IRUGO,
 385                           net->proc_net_stat, &rt_cpu_seq_fops);
 386         if (!pde)
 387                 goto err2;
 388
 389 #ifdef CONFIG_IP_ROUTE_CLASSID
 390         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 391         if (!pde)
 392                 goto err3;
 393 #endif
 394         return 0;
 395
 396 #ifdef CONFIG_IP_ROUTE_CLASSID
 397 err3:
 398         remove_proc_entry("rt_cache", net->proc_net_stat);
 399 #endif
 400 err2:
 401         remove_proc_entry("rt_cache", net->proc_net);
 402 err1:
 403         return -ENOMEM;
 404 }
 405
 406 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 407 {
 408         remove_proc_entry("rt_cache", net->proc_net_stat);
 409         remove_proc_entry("rt_cache", net->proc_net);
 410 #ifdef CONFIG_IP_ROUTE_CLASSID
 411         remove_proc_entry("rt_acct", net->proc_net);
 412 #endif
 413 }
 414
 415 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 416         .init = ip_rt_do_proc_init,
 417         .exit = ip_rt_do_proc_exit,
 418 };
 419
 420 static int __init ip_rt_proc_init(void)
 421 {
 422         return register_pernet_subsys(&ip_rt_proc_ops);
 423 }
 424
 425 #else
 426 static inline int ip_rt_proc_init(void)
 427 {
 428         return 0;
 429 }
 430 #endif /* CONFIG_PROC_FS */
 431
 432 static inline bool rt_is_expired(const struct rtable *rth)
 433 {
 434         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 435 }
 436
 437 void rt_cache_flush(struct net *net)
 438 {
 439         rt_genid_bump_ipv4(net);
 440 }
 441
 442 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 443                                            struct sk_buff *skb,
 444                                            const void *daddr)
 445 {
 446         struct net_device *dev = dst->dev;
 447         const __be32 *pkey = daddr;
 448         const struct rtable *rt;
 449         struct neighbour *n;
 450
 451         rt = (const struct rtable *) dst;
 452         if (rt->rt_gateway)
 453                 pkey = (const __be32 *) &rt->rt_gateway;
 454         else if (skb)
 455                 pkey = &ip_hdr(skb)->daddr;
 456
 457         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 458         if (n)
 459                 return n;
 460         return neigh_create(&arp_tbl, pkey, dev);
 461 }
 462
 463 #define IP_IDENTS_SZ 2048u
 464
 465 static atomic_t *ip_idents __read_mostly;
 466 static u32 *ip_tstamps __read_mostly;
 467
 468 /* In order to protect privacy, we add a perturbation to identifiers
 469  * if one generator is seldom used. This makes hard for an attacker
 470  * to infer how many packets were sent between two points in time.
 471  */
 472 u32 ip_idents_reserve(u32 hash, int segs)
 473 {
 474         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 475         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 476         u32 old = ACCESS_ONCE(*p_tstamp);
 477         u32 now = (u32)jiffies;
 478         u32 delta = 0;
 479
 480         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 481                 delta = prandom_u32_max(now - old);
 482
 483         return atomic_add_return(segs + delta, p_id) - segs;
 484 }
 485 EXPORT_SYMBOL(ip_idents_reserve);
 486
 487 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 488 {
 489         static u32 ip_idents_hashrnd __read_mostly;
 490         u32 hash, id;
 491
 492         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 493
 494         hash = jhash_3words((__force u32)iph->daddr,
 495                             (__force u32)iph->saddr,
 496                             iph->protocol ^ net_hash_mix(net),
 497                             ip_idents_hashrnd);
 498         id = ip_idents_reserve(hash, segs);
 499         iph->id = htons(id);
 500 }
 501 EXPORT_SYMBOL(__ip_select_ident);
 502
 503 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 504                              const struct iphdr *iph,
 505                              int oif, u8 tos,
 506                              u8 prot, u32 mark, int flow_flags)
 507 {
 508         if (sk) {
 509                 const struct inet_sock *inet = inet_sk(sk);
 510
 511                 oif = sk->sk_bound_dev_if;
 512                 mark = sk->sk_mark;
 513                 tos = RT_CONN_FLAGS(sk);
 514                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 515         }
 516         flowi4_init_output(fl4, oif, mark, tos,
 517                            RT_SCOPE_UNIVERSE, prot,
 518                            flow_flags,
 519                            iph->daddr, iph->saddr, 0, 0);
 520 }
 521
 522 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 523                                const struct sock *sk)
 524 {
 525         const struct iphdr *iph = ip_hdr(skb);
 526         int oif = skb->dev->ifindex;
 527         u8 tos = RT_TOS(iph->tos);
 528         u8 prot = iph->protocol;
 529         u32 mark = skb->mark;
 530
 531         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 532 }
 533
 534 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 535 {
 536         const struct inet_sock *inet = inet_sk(sk);
 537         const struct ip_options_rcu *inet_opt;
 538         __be32 daddr = inet->inet_daddr;
 539
 540         rcu_read_lock();
 541         inet_opt = rcu_dereference(inet->inet_opt);
 542         if (inet_opt && inet_opt->opt.srr)
 543                 daddr = inet_opt->opt.faddr;
 544         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 545                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 546                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 547                            inet_sk_flowi_flags(sk),
 548                            daddr, inet->inet_saddr, 0, 0);
 549         rcu_read_unlock();
 550 }
 551
 552 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 553                                  const struct sk_buff *skb)
 554 {
 555         if (skb)
 556                 build_skb_flow_key(fl4, skb, sk);
 557         else
 558                 build_sk_flow_key(fl4, sk);
 559 }
 560
 561 static inline void rt_free(struct rtable *rt)
 562 {
 563         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 564 }
 565
 566 static DEFINE_SPINLOCK(fnhe_lock);
 567
 568 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 569 {
 570         struct rtable *rt;
 571
 572         rt = rcu_dereference(fnhe->fnhe_rth_input);
 573         if (rt) {
 574                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 575                 rt_free(rt);
 576         }
 577         rt = rcu_dereference(fnhe->fnhe_rth_output);
 578         if (rt) {
 579                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 580                 rt_free(rt);
 581         }
 582 }
 583
 584 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 585 {
 586         struct fib_nh_exception *fnhe, *oldest;
 587
 588         oldest = rcu_dereference(hash->chain);
 589         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 590              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 591                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 592                         oldest = fnhe;
 593         }
 594         fnhe_flush_routes(oldest);
 595         return oldest;
 596 }
 597
 598 static inline u32 fnhe_hashfun(__be32 daddr)
 599 {
 600         static u32 fnhe_hashrnd __read_mostly;
 601         u32 hval;
 602
 603         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 604         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 605         return hash_32(hval, FNHE_HASH_SHIFT);
 606 }
 607
 608 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 609 {
 610         rt->rt_pmtu = fnhe->fnhe_pmtu;
 611         rt->dst.expires = fnhe->fnhe_expires;
 612
 613         if (fnhe->fnhe_gw) {
 614                 rt->rt_flags |= RTCF_REDIRECTED;
 615                 rt->rt_gateway = fnhe->fnhe_gw;
 616                 rt->rt_uses_gateway = 1;
 617         }
 618 }
 619
 620 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 621                                   u32 pmtu, unsigned long expires)
 622 {
 623         struct fnhe_hash_bucket *hash;
 624         struct fib_nh_exception *fnhe;
 625         struct rtable *rt;
 626         unsigned int i;
 627         int depth;
 628         u32 hval = fnhe_hashfun(daddr);
 629
 630         spin_lock_bh(&fnhe_lock);
 631
 632         hash = rcu_dereference(nh->nh_exceptions);
 633         if (!hash) {
 634                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 635                 if (!hash)
 636                         goto out_unlock;
 637                 rcu_assign_pointer(nh->nh_exceptions, hash);
 638         }
 639
 640         hash += hval;
 641
 642         depth = 0;
 643         for (fnhe = rcu_dereference(hash->chain); fnhe;
 644              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 645                 if (fnhe->fnhe_daddr == daddr)
 646                         break;
 647                 depth++;
 648         }
 649
 650         if (fnhe) {
 651                 if (gw)
 652                         fnhe->fnhe_gw = gw;
 653                 if (pmtu) {
 654                         fnhe->fnhe_pmtu = pmtu;
 655                         fnhe->fnhe_expires = max(1UL, expires);
 656                 }
 657                 /* Update all cached dsts too */
 658                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 659                 if (rt)
 660                         fill_route_from_fnhe(rt, fnhe);
 661                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 662                 if (rt)
 663                         fill_route_from_fnhe(rt, fnhe);
 664         } else {
 665                 if (depth > FNHE_RECLAIM_DEPTH)
 666                         fnhe = fnhe_oldest(hash);
 667                 else {
 668                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 669                         if (!fnhe)
 670                                 goto out_unlock;
 671
 672                         fnhe->fnhe_next = hash->chain;
 673                         rcu_assign_pointer(hash->chain, fnhe);
 674                 }
 675                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 676                 fnhe->fnhe_daddr = daddr;
 677                 fnhe->fnhe_gw = gw;
 678                 fnhe->fnhe_pmtu = pmtu;
 679                 fnhe->fnhe_expires = expires;
 680
 681                 /* Exception created; mark the cached routes for the nexthop
 682                  * stale, so anyone caching it rechecks if this exception
 683                  * applies to them.
 684                  */
 685                 rt = rcu_dereference(nh->nh_rth_input);
 686                 if (rt)
 687                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 688
 689                 for_each_possible_cpu(i) {
 690                         struct rtable __rcu **prt;
 691                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 692                         rt = rcu_dereference(*prt);
 693                         if (rt)
 694                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 695                 }
 696         }
 697
 698         fnhe->fnhe_stamp = jiffies;
 699
 700 out_unlock:
 701         spin_unlock_bh(&fnhe_lock);
 702 }
 703
 704 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 705                              bool kill_route)
 706 {
 707         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 708         __be32 old_gw = ip_hdr(skb)->saddr;
 709         struct net_device *dev = skb->dev;
 710         struct in_device *in_dev;
 711         struct fib_result res;
 712         struct neighbour *n;
 713         struct net *net;
 714
 715         switch (icmp_hdr(skb)->code & 7) {
 716         case ICMP_REDIR_NET:
 717         case ICMP_REDIR_NETTOS:
 718         case ICMP_REDIR_HOST:
 719         case ICMP_REDIR_HOSTTOS:
 720                 break;
 721
 722         default:
 723                 return;
 724         }
 725
 726         if (rt->rt_gateway != old_gw)
 727                 return;
 728
 729         in_dev = __in_dev_get_rcu(dev);
 730         if (!in_dev)
 731                 return;
 732
 733         net = dev_net(dev);
 734         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 735             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 736             ipv4_is_zeronet(new_gw))
 737                 goto reject_redirect;
 738
 739         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 740                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 741                         goto reject_redirect;
 742                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 743                         goto reject_redirect;
 744         } else {
 745                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 746                         goto reject_redirect;
 747         }
 748
 749         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 750         if (!IS_ERR(n)) {
 751                 if (!(n->nud_state & NUD_VALID)) {
 752                         neigh_event_send(n, NULL);
 753                 } else {
 754                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 755                                 struct fib_nh *nh = &FIB_RES_NH(res);
 756
 757                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 758                                                       0, 0);
 759                         }
 760                         if (kill_route)
 761                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 762                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 763                 }
 764                 neigh_release(n);
 765         }
 766         return;
 767
 768 reject_redirect:
 769 #ifdef CONFIG_IP_ROUTE_VERBOSE
 770         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 771                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 772                 __be32 daddr = iph->daddr;
 773                 __be32 saddr = iph->saddr;
 774
 775                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 776                                      "  Advised path = %pI4 -> %pI4\n",
 777                                      &old_gw, dev->name, &new_gw,
 778                                      &saddr, &daddr);
 779         }
 780 #endif
 781         ;
 782 }
 783
 784 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 785 {
 786         struct rtable *rt;
 787         struct flowi4 fl4;
 788         const struct iphdr *iph = (const struct iphdr *) skb->data;
 789         int oif = skb->dev->ifindex;
 790         u8 tos = RT_TOS(iph->tos);
 791         u8 prot = iph->protocol;
 792         u32 mark = skb->mark;
 793
 794         rt = (struct rtable *) dst;
 795
 796         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 797         __ip_do_redirect(rt, skb, &fl4, true);
 798 }
 799
 800 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 801 {
 802         struct rtable *rt = (struct rtable *)dst;
 803         struct dst_entry *ret = dst;
 804
 805         if (rt) {
 806                 if (dst->obsolete > 0) {
 807                         ip_rt_put(rt);
 808                         ret = NULL;
 809                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 810                            rt->dst.expires) {
 811                         ip_rt_put(rt);
 812                         ret = NULL;
 813                 }
 814         }
 815         return ret;
 816 }
 817
/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE: Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
 833
 834 void ip_rt_send_redirect(struct sk_buff *skb)
 835 {
 836         struct rtable *rt = skb_rtable(skb);
 837         struct in_device *in_dev;
 838         struct inet_peer *peer;
 839         struct net *net;
 840         int log_martians;
 841         int vif;
 842
 843         rcu_read_lock();
 844         in_dev = __in_dev_get_rcu(rt->dst.dev);
 845         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 846                 rcu_read_unlock();
 847                 return;
 848         }
 849         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 850         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 851         rcu_read_unlock();
 852
 853         net = dev_net(rt->dst.dev);
 854         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 855         if (!peer) {
 856                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 857                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 858                 return;
 859         }
 860
 861         /* No redirected packets during ip_rt_redirect_silence;
 862          * reset the algorithm.
 863          */
 864         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 865                 peer->rate_tokens = 0;
 866
 867         /* Too many ignored redirects; do not send anything
 868          * set dst.rate_last to the last seen redirected packet.
 869          */
 870         if (peer->rate_tokens >= ip_rt_redirect_number) {
 871                 peer->rate_last = jiffies;
 872                 goto out_put_peer;
 873         }
 874
 875         /* Check for load limit; set rate_last to the latest sent
 876          * redirect.
 877          */
 878         if (peer->rate_tokens == 0 ||
 879             time_after(jiffies,
 880                        (peer->rate_last +
 881                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 882                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 883
 884                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 885                 peer->rate_last = jiffies;
 886                 ++peer->rate_tokens;
 887 #ifdef CONFIG_IP_ROUTE_VERBOSE
 888                 if (log_martians &&
 889                     peer->rate_tokens == ip_rt_redirect_number)
 890                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 891                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 892                                              &ip_hdr(skb)->daddr, &gw);
 893 #endif
 894         }
 895 out_put_peer:
 896         inet_putpeer(peer);
 897 }
 898
 899 static int ip_error(struct sk_buff *skb)
 900 {
 901         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 902         struct rtable *rt = skb_rtable(skb);
 903         struct inet_peer *peer;
 904         unsigned long now;
 905         struct net *net;
 906         bool send;
 907         int code;
 908
 909         /* IP on this device is disabled. */
 910         if (!in_dev)
 911                 goto out;
 912
 913         net = dev_net(rt->dst.dev);
 914         if (!IN_DEV_FORWARD(in_dev)) {
 915                 switch (rt->dst.error) {
 916                 case EHOSTUNREACH:
 917                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 918                         break;
 919
 920                 case ENETUNREACH:
 921                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 922                         break;
 923                 }
 924                 goto out;
 925         }
 926
 927         switch (rt->dst.error) {
 928         case EINVAL:
 929         default:
 930                 goto out;
 931         case EHOSTUNREACH:
 932                 code = ICMP_HOST_UNREACH;
 933                 break;
 934         case ENETUNREACH:
 935                 code = ICMP_NET_UNREACH;
 936                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 937                 break;
 938         case EACCES:
 939                 code = ICMP_PKT_FILTERED;
 940                 break;
 941         }
 942
 943         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 944                                l3mdev_master_ifindex(skb->dev), 1);
 945
 946         send = true;
 947         if (peer) {
 948                 now = jiffies;
 949                 peer->rate_tokens += now - peer->rate_last;
 950                 if (peer->rate_tokens > ip_rt_error_burst)
 951                         peer->rate_tokens = ip_rt_error_burst;
 952                 peer->rate_last = now;
 953                 if (peer->rate_tokens >= ip_rt_error_cost)
 954                         peer->rate_tokens -= ip_rt_error_cost;
 955                 else
 956                         send = false;
 957                 inet_putpeer(peer);
 958         }
 959         if (send)
 960                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 961
 962 out:    kfree_skb(skb);
 963         return 0;
 964 }
 965
 966 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 967 {
 968         struct dst_entry *dst = &rt->dst;
 969         struct fib_result res;
 970
 971         if (dst_metric_locked(dst, RTAX_MTU))
 972                 return;
 973
 974         if (ipv4_mtu(dst) < mtu)
 975                 return;
 976
 977         if (mtu < ip_rt_min_pmtu)
 978                 mtu = ip_rt_min_pmtu;
 979
 980         if (rt->rt_pmtu == mtu &&
 981             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
 982                 return;
 983
 984         rcu_read_lock();
 985         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 986                 struct fib_nh *nh = &FIB_RES_NH(res);
 987
 988                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 989                                       jiffies + ip_rt_mtu_expires);
 990         }
 991         rcu_read_unlock();
 992 }
 993
 994 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 995                               struct sk_buff *skb, u32 mtu)
 996 {
 997         struct rtable *rt = (struct rtable *) dst;
 998         struct flowi4 fl4;
 999
1000         ip_rt_build_flow_key(&fl4, sk, skb);
1001         __ip_rt_update_pmtu(rt, &fl4, mtu);
1002 }
1003
1004 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1005                       int oif, u32 mark, u8 protocol, int flow_flags)
1006 {
1007         const struct iphdr *iph = (const struct iphdr *) skb->data;
1008         struct flowi4 fl4;
1009         struct rtable *rt;
1010
1011         if (!mark)
1012                 mark = IP4_REPLY_MARK(net, skb->mark);
1013
1014         __build_flow_key(&fl4, NULL, iph, oif,
1015                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1016         rt = __ip_route_output_key(net, &fl4);
1017         if (!IS_ERR(rt)) {
1018                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1019                 ip_rt_put(rt);
1020         }
1021 }
1022 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1023
1024 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1025 {
1026         const struct iphdr *iph = (const struct iphdr *) skb->data;
1027         struct flowi4 fl4;
1028         struct rtable *rt;
1029
1030         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1031
1032         if (!fl4.flowi4_mark)
1033                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1034
1035         rt = __ip_route_output_key(sock_net(sk), &fl4);
1036         if (!IS_ERR(rt)) {
1037                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1038                 ip_rt_put(rt);
1039         }
1040 }
1041
/*
 * Apply a path-MTU update (@mtu) to the route used by socket @sk.  The IP
 * header at skb->data supplies the flow key.
 *
 * Runs with the socket bh-locked.  If the socket's cached dst turns out to
 * be invalid before or after the update, a new route is looked up and
 * installed on the socket.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        /* true once rt comes from a lookup done here rather than from odst */
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        /* Socket owned by user, or no cached dst: update through a
         * separate lookup and leave the socket's cached dst untouched.
         */
        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        /* Cached dst no longer passes its check: look up a replacement. */
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        /* The update is applied to the dst's path entry. */
        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        /* If rt fails dst_check() after the update, look up again,
         * dropping the reference from the earlier lookup if there was one.
         */
        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        /* Install the replacement route on the socket. */
        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        /* Drop the reference from sk_dst_get(); odst may still be NULL here. */
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1094
1095 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1096                    int oif, u32 mark, u8 protocol, int flow_flags)
1097 {
1098         const struct iphdr *iph = (const struct iphdr *) skb->data;
1099         struct flowi4 fl4;
1100         struct rtable *rt;
1101
1102         __build_flow_key(&fl4, NULL, iph, oif,
1103                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1104         rt = __ip_route_output_key(net, &fl4);
1105         if (!IS_ERR(rt)) {
1106                 __ip_do_redirect(rt, skb, &fl4, false);
1107                 ip_rt_put(rt);
1108         }
1109 }
1110 EXPORT_SYMBOL_GPL(ipv4_redirect);
1111
1112 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1113 {
1114         const struct iphdr *iph = (const struct iphdr *) skb->data;
1115         struct flowi4 fl4;
1116         struct rtable *rt;
1117
1118         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1119         rt = __ip_route_output_key(sock_net(sk), &fl4);
1120         if (!IS_ERR(rt)) {
1121                 __ip_do_redirect(rt, skb, &fl4, false);
1122                 ip_rt_put(rt);
1123         }
1124 }
1125 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1126
1127 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1128 {
1129         struct rtable *rt = (struct rtable *) dst;
1130
1131         /* All IPV4 dsts are created with ->obsolete set to the value
1132          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1133          * into this function always.
1134          *
1135          * When a PMTU/redirect information update invalidates a route,
1136          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1137          * DST_OBSOLETE_DEAD by dst_free().
1138          */
1139         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1140                 return NULL;
1141         return dst;
1142 }
1143
1144 static void ipv4_link_failure(struct sk_buff *skb)
1145 {
1146         struct rtable *rt;
1147
1148         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1149
1150         rt = skb_rtable(skb);
1151         if (rt)
1152                 dst_set_expires(&rt->dst, 0);
1153 }
1154
1155 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1156 {
1157         pr_debug("%s: %pI4 -> %pI4, %s\n",
1158                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1159                  skb->dev ? skb->dev->name : "?");
1160         kfree_skb(skb);
1161         WARN_ON(1);
1162         return 0;
1163 }
1164
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1173
1174 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1175 {
1176         __be32 src;
1177
1178         if (rt_is_output_route(rt))
1179                 src = ip_hdr(skb)->saddr;
1180         else {
1181                 struct fib_result res;
1182                 struct flowi4 fl4;
1183                 struct iphdr *iph;
1184
1185                 iph = ip_hdr(skb);
1186
1187                 memset(&fl4, 0, sizeof(fl4));
1188                 fl4.daddr = iph->daddr;
1189                 fl4.saddr = iph->saddr;
1190                 fl4.flowi4_tos = RT_TOS(iph->tos);
1191                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1192                 fl4.flowi4_iif = skb->dev->ifindex;
1193                 fl4.flowi4_mark = skb->mark;
1194
1195                 rcu_read_lock();
1196                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1197                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1198                 else
1199                         src = inet_select_addr(rt->dst.dev,
1200                                                rt_nexthop(rt, iph->daddr),
1201                                                RT_SCOPE_UNIVERSE);
1202                 rcu_read_unlock();
1203         }
1204         memcpy(addr, &src, 4);
1205 }
1206
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time: a half
 * of tclassid that is still zero takes the corresponding half of @tag,
 * a half that is already set is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;
        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1216
1217 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1218 {
1219         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1220
1221         if (advmss == 0) {
1222                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1223                                ip_rt_min_advmss);
1224                 if (advmss > 65535 - 40)
1225                         advmss = 65535 - 40;
1226         }
1227         return advmss;
1228 }
1229
1230 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1231 {
1232         const struct rtable *rt = (const struct rtable *) dst;
1233         unsigned int mtu = rt->rt_pmtu;
1234
1235         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1236                 mtu = dst_metric_raw(dst, RTAX_MTU);
1237
1238         if (mtu)
1239                 return mtu;
1240
1241         mtu = dst->dev->mtu;
1242
1243         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1244                 if (rt->rt_uses_gateway && mtu > 576)
1245                         mtu = 576;
1246         }
1247
1248         return min_t(unsigned int, mtu, IP_MAX_MTU);
1249 }
1250
1251 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1252 {
1253         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1254         struct fib_nh_exception *fnhe;
1255         u32 hval;
1256
1257         if (!hash)
1258                 return NULL;
1259
1260         hval = fnhe_hashfun(daddr);
1261
1262         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1263              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1264                 if (fnhe->fnhe_daddr == daddr)
1265                         return fnhe;
1266         }
1267         return NULL;
1268 }
1269
/*
 * Attach route @rt to nexthop exception @fnhe for destination @daddr.
 *
 * Under fnhe_lock: copies the exception's data into @rt via
 * fill_route_from_fnhe() and, unless @rt is flagged DST_NOCACHE, stores @rt
 * in the exception's input or output slot, freeing the route that was there.
 *
 * Returns true if @rt was stored in the exception, false otherwise
 * (including when @fnhe no longer describes @daddr).
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        /* Re-check the destination now that the lock is held. */
        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                /* Input and output routes are kept in separate slots. */
                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                /* Generation mismatch: reset the exception's gateway, PMTU
                 * and expiry and flush its routes.  orig is cleared so that
                 * it is not freed again below.
                 */
                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                /* No gateway set on the route: use the destination itself. */
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}
1313
1314 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1315 {
1316         struct rtable *orig, *prev, **p;
1317         bool ret = true;
1318
1319         if (rt_is_input_route(rt)) {
1320                 p = (struct rtable **)&nh->nh_rth_input;
1321         } else {
1322                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1323         }
1324         orig = *p;
1325
1326         prev = cmpxchg(p, orig, rt);
1327         if (prev == orig) {
1328                 if (orig)
1329                         rt_free(orig);
1330         } else
1331                 ret = false;
1332
1333         return ret;
1334 }
1335
/*
 * A list of routes that are not stored in a nexthop cache slot, together
 * with the spinlock that protects it.
 */
struct uncached_list {
        spinlock_t              lock;   /* taken with BHs disabled around list changes */
        struct list_head        head;   /* routes are linked through rtable::rt_uncached */
};

/* One uncached list per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1342
1343 static void rt_add_uncached_list(struct rtable *rt)
1344 {
1345         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1346
1347         rt->rt_uncached_list = ul;
1348
1349         spin_lock_bh(&ul->lock);
1350         list_add_tail(&rt->rt_uncached, &ul->head);
1351         spin_unlock_bh(&ul->lock);
1352 }
1353
1354 static void ipv4_dst_destroy(struct dst_entry *dst)
1355 {
1356         struct rtable *rt = (struct rtable *) dst;
1357
1358         if (!list_empty(&rt->rt_uncached)) {
1359                 struct uncached_list *ul = rt->rt_uncached_list;
1360
1361                 spin_lock_bh(&ul->lock);
1362                 list_del(&rt->rt_uncached);
1363                 spin_unlock_bh(&ul->lock);
1364         }
1365 }
1366
1367 void rt_flush_dev(struct net_device *dev)
1368 {
1369         struct net *net = dev_net(dev);
1370         struct rtable *rt;
1371         int cpu;
1372
1373         for_each_possible_cpu(cpu) {
1374                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1375
1376                 spin_lock_bh(&ul->lock);
1377                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1378                         if (rt->dst.dev != dev)
1379                                 continue;
1380                         rt->dst.dev = net->loopback_dev;
1381                         dev_hold(rt->dst.dev);
1382                         dev_put(dev);
1383                 }
1384                 spin_unlock_bh(&ul->lock);
1385         }
1386 }
1387
1388 static bool rt_cache_valid(const struct rtable *rt)
1389 {
1390         return  rt &&
1391                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1392                 !rt_is_expired(rt);
1393 }
1394
/*
 * Fill in the nexthop-derived parts of a freshly allocated route @rt and
 * try to cache it.
 *
 * With FIB info (@fi) present: gateway, metrics, class id and lwtunnel
 * state are taken from the nexthop in @res, and the route is stored either
 * in the nexthop exception @fnhe (when given) or in the nexthop's cache
 * slot.  A route that could not be stored, or that has no FIB info at all,
 * is placed on the uncached list instead.
 *
 * @type is not used in this function.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                /* Only a link-scope nexthop gateway is recorded on the route. */
                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                /* An exception takes precedence over the plain nexthop cache. */
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

        /* Merge class tags: the result's tclassid first (multiple-tables
         * builds only), then @itag.
         */
#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}
1439
1440 static struct rtable *rt_dst_alloc(struct net_device *dev,
1441                                    unsigned int flags, u16 type,
1442                                    bool nopolicy, bool noxfrm, bool will_cache)
1443 {
1444         struct rtable *rt;
1445
1446         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1447                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1448                        (nopolicy ? DST_NOPOLICY : 0) |
1449                        (noxfrm ? DST_NOXFRM : 0));
1450
1451         if (rt) {
1452                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1453                 rt->rt_flags = flags;
1454                 rt->rt_type = type;
1455                 rt->rt_is_input = 0;
1456                 rt->rt_iif = 0;
1457                 rt->rt_pmtu = 0;
1458                 rt->rt_gateway = 0;
1459                 rt->rt_uses_gateway = 0;
1460                 rt->rt_table_id = 0;
1461                 INIT_LIST_HEAD(&rt->rt_uncached);
1462
1463                 rt->dst.output = ip_output;
1464                 if (flags & RTCF_LOCAL)
1465                         rt->dst.input = ip_local_deliver;
1466         }
1467
1468         return rt;
1469 }
1470
1471 /* called in rcu_read_lock() section */
1472 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1473                                 u8 tos, struct net_device *dev, int our)
1474 {
1475         struct rtable *rth;
1476         struct in_device *in_dev = __in_dev_get_rcu(dev);
1477         unsigned int flags = RTCF_MULTICAST;
1478         u32 itag = 0;
1479         int err;
1480
1481         /* Primary sanity checks. */
1482
1483         if (!in_dev)
1484                 return -EINVAL;
1485
1486         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1487             skb->protocol != htons(ETH_P_IP))
1488                 goto e_inval;
1489
1490         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1491                 goto e_inval;
1492
1493         if (ipv4_is_zeronet(saddr)) {
1494                 if (!ipv4_is_local_multicast(daddr))
1495                         goto e_inval;
1496         } else {
1497                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1498                                           in_dev, &itag);
1499                 if (err < 0)
1500                         goto e_err;
1501         }
1502         if (our)
1503                 flags |= RTCF_LOCAL;
1504
1505         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1506                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1507         if (!rth)
1508                 goto e_nobufs;
1509
1510 #ifdef CONFIG_IP_ROUTE_CLASSID
1511         rth->dst.tclassid = itag;
1512 #endif
1513         rth->dst.output = ip_rt_bug;
1514         rth->rt_is_input= 1;
1515
1516 #ifdef CONFIG_IP_MROUTE
1517         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1518                 rth->dst.input = ip_mr_input;
1519 #endif
1520         RT_CACHE_STAT_INC(in_slow_mc);
1521
1522         skb_dst_set(skb, &rth->dst);
1523         return 0;
1524
1525 e_nobufs:
1526         return -ENOBUFS;
1527 e_inval:
1528         return -EINVAL;
1529 e_err:
1530         return err;
1531 }
1532
1533
1534 static void ip_handle_martian_source(struct net_device *dev,
1535                                      struct in_device *in_dev,
1536                                      struct sk_buff *skb,
1537                                      __be32 daddr,
1538                                      __be32 saddr)
1539 {
1540         RT_CACHE_STAT_INC(in_martian_src);
1541 #ifdef CONFIG_IP_ROUTE_VERBOSE
1542         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1543                 /*
1544                  *      RFC1812 recommendation, if source is martian,
1545                  *      the only hint is MAC header.
1546                  */
1547                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1548                         &daddr, &saddr, dev->name);
1549                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1550                         print_hex_dump(KERN_WARNING, "ll header: ",
1551                                        DUMP_PREFIX_OFFSET, 16, 1,
1552                                        skb_mac_header(skb),
1553                                        dev->hard_header_len, true);
1554                 }
1555         }
1556 #endif
1557 }
1558
/*
 * Create (or reuse from cache) the input route for a packet that is to be
 * forwarded according to FIB result @res, and attach it to @skb.
 *
 * Called in rcu_read_lock() section.
 *
 * Returns 0 on success, -EINVAL if the output device has no IP
 * configuration or a non-IP packet may not be routed back out of its
 * ingress device, -ENOBUFS if the route cannot be allocated, or the
 * negative result of fib_validate_source().
 */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (!out_dev) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        /* The route is cacheable only with FIB info and no class tag. */
        do_cache = res->fi && !itag;
        /* Packet would leave through the device it arrived on: flag it so
         * that a redirect can be sent, provided redirects are enabled and
         * the medium is shared or the source is on-link with the gateway.
         * Note that err is still the (non-negative) result of
         * fib_validate_source() here.
         */
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            skb->protocol == htons(ETH_P_IP) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
                IPCB(skb)->flags |= IPSKB_DOREDIRECT;

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create route, if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 *
                 * Proxy arp feature have been extended to allow, ARP
                 * replies back to the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        /* Try the cached input route first: the exception's if there is
         * one for daddr, otherwise the nexthop's.
         */
        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
        if (do_cache) {
                if (fnhe)
                        rth = rcu_dereference(fnhe->fnhe_rth_input);
                else
                        rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }

        /* No usable cached route: allocate a new one. */
        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        rth->rt_is_input = 1;
        if (res->table)
                rth->rt_table_id = res->table->tb_id;
        RT_CACHE_STAT_INC(in_slow_tot);

        rth->dst.input = ip_forward;

        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
        /* Let a lightweight tunnel intercept output and/or input, saving
         * the original handlers in the tunnel state.
         */
        if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
                rth->dst.lwtstate->orig_output = rth->dst.output;
                rth->dst.output = lwtunnel_output;
        }
        if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
                rth->dst.lwtstate->orig_input = rth->dst.input;
                rth->dst.input = lwtunnel_input;
        }
        skb_dst_set(skb, &rth->dst);
out:
        err = 0;
 cleanup:
        return err;
}
1653
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Multipath hash for ICMP packets.
 *
 * For ICMP error messages (destination unreachable, redirect, time
 * exceeded, parameter problem) the hash is computed over the addresses of
 * the embedded IP header, in reverse order, so that the error follows the
 * same path as the flow it refers to.  Anything else, including packets
 * with a non-zero fragment offset or headers that cannot be read, is
 * hashed on the outer addresses.
 */
static int ip_multipath_icmp_hash(struct sk_buff *skb)
{
        const struct iphdr *outer = ip_hdr(skb);
        const struct icmphdr *icmp;
        const struct iphdr *inner;
        struct icmphdr icmp_buf;
        struct iphdr inner_buf;

        if (unlikely((outer->frag_off & htons(IP_OFFSET)) != 0))
                goto outer_hash;

        icmp = skb_header_pointer(skb, outer->ihl * 4, sizeof(icmp_buf),
                                  &icmp_buf);
        if (!icmp)
                goto outer_hash;

        switch (icmp->type) {
        case ICMP_DEST_UNREACH:
        case ICMP_REDIRECT:
        case ICMP_TIME_EXCEEDED:
        case ICMP_PARAMETERPROB:
                break;
        default:
                goto outer_hash;
        }

        inner = skb_header_pointer(skb,
                                   outer->ihl * 4 + sizeof(icmp_buf),
                                   sizeof(inner_buf), &inner_buf);
        if (!inner)
                goto outer_hash;

        return fib_multipath_hash(inner->daddr, inner->saddr);

outer_hash:
        return fib_multipath_hash(outer->saddr, outer->daddr);
}

#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1695
/*
 * Create the input route for @skb from an already-resolved FIB result.
 * Called from ip_route_input_slow() once the result is RTN_UNICAST and
 * forwarding is enabled on the input device.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, when the FIB result carries more than one
 * nexthop (fib_nhs > 1), a hash is computed first -- via
 * ip_multipath_icmp_hash() for ICMP, from saddr/daddr otherwise -- and handed
 * to fib_select_multipath() together with @res.  The route itself is then
 * built by __mkroute_input(), whose return value is passed through unchanged.
 *
 * @fl4 is accepted but not referenced in this function body.
 */
1696 static int ip_mkroute_input(struct sk_buff *skb,
1697                             struct fib_result *res,
1698                             const struct flowi4 *fl4,
1699                             struct in_device *in_dev,
1700                             __be32 daddr, __be32 saddr, u32 tos)
1701 {
1702 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1703         if (res->fi && res->fi->fib_nhs > 1) {
1704                 int h;
1705
1706                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1707                         h = ip_multipath_icmp_hash(skb);
1708                 else
1709                         h = fib_multipath_hash(saddr, daddr);
1710                 fib_select_multipath(res, h);
1711         }
1712 #endif
1713
1714         /* create a routing cache entry */
1715         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1716 }
1717
1718 /*
1719  *      NOTE. We drop all packets that have local source
1720  *      addresses, because every properly looped back packet
1721  *      must have the correct destination already attached by the output
      *      routine.
1722  *
1723  *      This approach solves two big problems:
1724  *      1. Non-simplex devices are handled properly.
1725  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1726  *      Called with rcu_read_lock() held.
      *
      *      Slow-path input route resolution for a received packet.
      *      Any dst already attached to @skb is dropped early on.
      *
      *      Return value:
      *        0         a dst has been attached to @skb.  This includes the
      *                  no_route case: the lookup error is not returned but
      *                  stored (sign-flipped) in the dst, whose input
      *                  handler is ip_error.
      *        -EINVAL   @dev has no in_device, the destination is martian,
      *                  or a broadcast arrived in a non-IP frame.
      *        -ENOBUFS  rt_dst_alloc() failed.
      *        other     the result of ip_mkroute_input() for forwarded
      *                  packets; for martian sources, the value err holds
      *                  at the jump (-EINVAL initially, or the negative
      *                  fib_validate_source() result).
1727  */
1728
1729 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1730                                u8 tos, struct net_device *dev)
1731 {
1732         struct fib_result res;
1733         struct in_device *in_dev = __in_dev_get_rcu(dev);
1734         struct ip_tunnel_info *tun_info;
1735         struct flowi4   fl4;
1736         unsigned int    flags = 0;
1737         u32             itag = 0;
1738         struct rtable   *rth;
1739         int             err = -EINVAL;
1740         struct net    *net = dev_net(dev);
1741         bool do_cache;
1742
             /* NOTE(review): res.table is assigned NULL again further down,
              * before anything in this function reads it; 0 is used here as
              * the null pointer constant.
              */
1743         res.table = 0;
1744
1745         /* IP on this device is disabled. */
1746
1747         if (!in_dev)
1748                 goto out;
1749
1750         /* Check for the weirdest martians, which cannot be detected
1751            by fib_lookup.
1752          */
1753
             /* Carry the tunnel id into the flow key only when the skb has
              * tunnel info that is not flagged IP_TUNNEL_INFO_TX.
              */
1754         tun_info = skb_tunnel_info(skb);
1755         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1756                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1757         else
1758                 fl4.flowi4_tun_key.tun_id = 0;
1759         skb_dst_drop(skb);
1760
1761         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1762                 goto martian_source;
1763
             /* brd_input/local_input below read res.fi and res.table, and
              * can be reached without fib_lookup() having run.
              */
1764         res.fi = NULL;
1765         res.table = NULL;
1766         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1767                 goto brd_input;
1768
1769         /* Accept zero addresses only to limited broadcast;
1770          * I do not even know whether to fix it or not. Waiting for complaints :-)
1771          */
1772         if (ipv4_is_zeronet(saddr))
1773                 goto martian_source;
1774
1775         if (ipv4_is_zeronet(daddr))
1776                 goto martian_destination;
1777
1778         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1779          * more than once when daddr and/or saddr are loopback addresses
1780          */
1781         if (ipv4_is_loopback(daddr)) {
1782                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1783                         goto martian_destination;
1784         } else if (ipv4_is_loopback(saddr)) {
1785                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1786                         goto martian_source;
1787         }
1788
1789         /*
1790          *      Now we are ready to route packet.
1791          */
1792         fl4.flowi4_oif = 0;
1793         fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1794         fl4.flowi4_mark = skb->mark;
1795         fl4.flowi4_tos = tos;
1796         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1797         fl4.flowi4_flags = 0;
1798         fl4.daddr = daddr;
1799         fl4.saddr = saddr;
1800         err = fib_lookup(net, &fl4, &res, 0);
1801         if (err != 0) {
                     /* Keep the lookup error when forwarding is enabled;
                      * otherwise report host-unreachable.
                      */
1802                 if (!IN_DEV_FORWARD(in_dev))
1803                         err = -EHOSTUNREACH;
1804                 goto no_route;
1805         }
1806
1807         if (res.type == RTN_BROADCAST)
1808                 goto brd_input;
1809
1810         if (res.type == RTN_LOCAL) {
1811                 err = fib_validate_source(skb, saddr, daddr, tos,
1812                                           0, dev, in_dev, &itag);
1813                 if (err < 0)
1814                         goto martian_source;
1815                 goto local_input;
1816         }
1817
             /* From here on the packet would have to be forwarded. */
1818         if (!IN_DEV_FORWARD(in_dev)) {
1819                 err = -EHOSTUNREACH;
1820                 goto no_route;
1821         }
1822         if (res.type != RTN_UNICAST)
1823                 goto martian_destination;
1824
1825         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1826 out:    return err;
1827
             /* Broadcast delivery: accepted only for IP frames; the source is
              * validated unless it lies in the zero network.
              */
1828 brd_input:
1829         if (skb->protocol != htons(ETH_P_IP))
1830                 goto e_inval;
1831
1832         if (!ipv4_is_zeronet(saddr)) {
1833                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1834                                           in_dev, &itag);
1835                 if (err < 0)
1836                         goto martian_source;
1837         }
1838         flags |= RTCF_BROADCAST;
1839         res.type = RTN_BROADCAST;
1840         RT_CACHE_STAT_INC(in_brd);
1841
             /* Local delivery; also entered for broadcast and (via no_route)
              * unreachable results.  With a fib_info and no itag, the
              * nexthop's cached input route is reused when still valid;
              * otherwise a new dst is allocated and, if do_cache is set,
              * offered to the nexthop cache afterwards.
              */
1842 local_input:
1843         do_cache = false;
1844         if (res.fi) {
1845                 if (!itag) {
1846                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1847                         if (rt_cache_valid(rth)) {
1848                                 skb_dst_set_noref(skb, &rth->dst);
1849                                 err = 0;
1850                                 goto out;
1851                         }
1852                         do_cache = true;
1853                 }
1854         }
1855
1856         rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1857                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1858         if (!rth)
1859                 goto e_nobufs;
1860
             /* An input route is never meant to be used for output. */
1861         rth->dst.output= ip_rt_bug;
1862 #ifdef CONFIG_IP_ROUTE_CLASSID
1863         rth->dst.tclassid = itag;
1864 #endif
1865         rth->rt_is_input = 1;
1866         if (res.table)
1867                 rth->rt_table_id = res.table->tb_id;
1868
1869         RT_CACHE_STAT_INC(in_slow_tot);
1870         if (res.type == RTN_UNREACHABLE) {
                     /* err still holds the negative code set before the jump
                      * to no_route; it is stored sign-flipped in the dst.
                      */
1871                 rth->dst.input= ip_error;
1872                 rth->dst.error= -err;
1873                 rth->rt_flags   &= ~RTCF_LOCAL;
1874         }
1875         if (do_cache) {
                     /* Caching on the nexthop failed: mark the dst uncached
                      * and put it on the uncached list instead.
                      */
1876                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1877                         rth->dst.flags |= DST_NOCACHE;
1878                         rt_add_uncached_list(rth);
1879                 }
1880         }
1881         skb_dst_set(skb, &rth->dst);
1882         err = 0;
1883         goto out;
1884
             /* Lookup failed or forwarding is disabled: build an
              * RTN_UNREACHABLE dst through local_input, with no fib_info
              * and no table.
              */
1885 no_route:
1886         RT_CACHE_STAT_INC(in_no_route);
1887         res.type = RTN_UNREACHABLE;
1888         res.fi = NULL;
1889         res.table = NULL;
1890         goto local_input;
1891
1892         /*
1893          *      Do not cache martian addresses: they should be logged (RFC1812)
1894          */
1895 martian_destination:
1896         RT_CACHE_STAT_INC(in_martian_dst);
1897 #ifdef CONFIG_IP_ROUTE_VERBOSE
1898         if (IN_DEV_LOG_MARTIANS(in_dev))
1899                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1900                                      &daddr, &saddr, dev->name);
1901 #endif
1902
1903 e_inval:
1904         err = -EINVAL;
1905         goto out;
1906
1907 e_nobufs:
1908         err = -ENOBUFS;
1909         goto out;
1910
             /* err is left as it was at the jump; see the header comment. */
1911 martian_source:
1912         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1913         goto out;
1914 }
1915
/*
 * Resolve the input route for @skb.  rcu_read_lock() is taken on entry and
 * released on every return path.
 *
 * Multicast destinations are decided here: if @dev has an in_device and
 * either ip_check_mc_rcu() reports the group as ours or (CONFIG_IP_MROUTE
 * only) the group is not link-local and multicast forwarding is enabled on
 * the device, the result of ip_route_input_mc() is returned; otherwise
 * -EINVAL.  All other destinations go to ip_route_input_slow() and its
 * result is returned.
 */
1916 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1917                          u8 tos, struct net_device *dev)
1918 {
1919         int res;
1920
1921         rcu_read_lock();
1922
1923         /* Multicast recognition logic is moved from route cache to here.
1924            The problem was that too many Ethernet cards have broken/missing
1925            hardware multicast filters :-( As result the host on multicasting
1926            network acquires a lot of useless route cache entries, sort of
1927            SDR messages from all the world. Now we try to get rid of them.
1928            Really, provided software IP multicast filter is organized
1929            reasonably (at least, hashed), it does not result in a slowdown
1930            comparing with route cache reject entries.
1931            Note, that multicast routers are not affected, because
1932            route cache entry is created eventually.
1933          */
1934         if (ipv4_is_multicast(daddr)) {
1935                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1936
1937                 if (in_dev) {
1938                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1939                                                   ip_hdr(skb)->protocol);
1940                         if (our
1941 #ifdef CONFIG_IP_MROUTE
1942                                 ||
1943                             (!ipv4_is_local_multicast(daddr) &&
1944                              IN_DEV_MFORWARD(in_dev))
1945 #endif
1946                            ) {
                                     /* This 'res' shadows the function-level
                                      * variable of the same name.
                                      */
1947                                 int res = ip_route_input_mc(skb, daddr, saddr,
1948                                                             tos, dev, our);
1949                                 rcu_read_unlock();
1950                                 return res;
1951                         }
1952                 }
                     /* Multicast that is neither ours nor to be forwarded. */
1953                 rcu_read_unlock();
1954                 return -EINVAL;
1955         }
1956         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1957         rcu_read_unlock();
1958         return res;
1959 }
1960 EXPORT_SYMBOL(ip_route_input_noref);
1961
1962 /* called with rcu_read_lock()
      *
      * Produce the output route for an already-resolved lookup: either a
      * still-valid cached entry (returned after dst_hold()) or a newly
      * allocated one.
      *
      * @res:      lookup result; res->type is read unconditionally as the
      *            starting route type, res->fi and res->table when set
      * @fl4:      flow being routed
      * @orig_oif: stored in rt_iif of a newly allocated route
      * @dev_out:  output device
      * @flags:    initial RTCF_* flags; more are added here
      *
      * Returns the route, or ERR_PTR(-EINVAL) when @dev_out has no
      * in_device, when a loopback source is used on a non-loopback device
      * without route_localnet, or when the destination is in the zero
      * network; ERR_PTR(-ENOBUFS) when rt_dst_alloc() fails.
      */
1963 static struct rtable *__mkroute_output(const struct fib_result *res,
1964                                        const struct flowi4 *fl4, int orig_oif,
1965                                        struct net_device *dev_out,
1966                                        unsigned int flags)
1967 {
1968         struct fib_info *fi = res->fi;
1969         struct fib_nh_exception *fnhe;
1970         struct in_device *in_dev;
1971         u16 type = res->type;
1972         struct rtable *rth;
1973         bool do_cache;
1974
1975         in_dev = __in_dev_get_rcu(dev_out);
1976         if (!in_dev)
1977                 return ERR_PTR(-EINVAL);
1978
1979         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1980                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1981                         return ERR_PTR(-EINVAL);
1982
             /* The destination address overrides the looked-up type for
              * limited broadcast and multicast.
              */
1983         if (ipv4_is_lbcast(fl4->daddr))
1984                 type = RTN_BROADCAST;
1985         else if (ipv4_is_multicast(fl4->daddr))
1986                 type = RTN_MULTICAST;
1987         else if (ipv4_is_zeronet(fl4->daddr))
1988                 return ERR_PTR(-EINVAL);
1989
1990         if (dev_out->flags & IFF_LOOPBACK)
1991                 flags |= RTCF_LOCAL;
1992
1993         do_cache = true;
1994         if (type == RTN_BROADCAST) {
1995                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1996                 fi = NULL;
1997         } else if (type == RTN_MULTICAST) {
                     /* RTCF_LOCAL is kept only if ip_check_mc_rcu() reports
                      * the group as ours on this device; such routes are not
                      * cached.
                      */
1998                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1999                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2000                                      fl4->flowi4_proto))
2001                         flags &= ~RTCF_LOCAL;
2002                 else
2003                         do_cache = false;
2004                 /* If multicast route do not exist use
2005                  * default one, but do not gateway in this case.
2006                  * Yes, it is hack.
2007                  */
2008                 if (fi && res->prefixlen < 4)
2009                         fi = NULL;
2010         }
2011
2012         fnhe = NULL;
             /* Caching needs a fib_info to hang the entry on. */
2013         do_cache &= fi != NULL;
2014         if (do_cache) {
2015                 struct rtable __rcu **prth;
2016                 struct fib_nh *nh = &FIB_RES_NH(*res);
2017
                     /* Prefer the per-destination exception's cached route;
                      * otherwise use this CPU's slot on the nexthop.
                      */
2018                 fnhe = find_exception(nh, fl4->daddr);
2019                 if (fnhe)
2020                         prth = &fnhe->fnhe_rth_output;
2021                 else {
                             /* FLOWI_FLAG_KNOWN_NH is only served from the
                              * cache when the nexthop has a gateway with
                              * link scope; otherwise build an uncached route.
                              */
2022                         if (unlikely(fl4->flowi4_flags &
2023                                      FLOWI_FLAG_KNOWN_NH &&
2024                                      !(nh->nh_gw &&
2025                                        nh->nh_scope == RT_SCOPE_LINK))) {
2026                                 do_cache = false;
2027                                 goto add;
2028                         }
2029                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2030                 }
2031                 rth = rcu_dereference(*prth);
2032                 if (rt_cache_valid(rth)) {
2033                         dst_hold(&rth->dst);
2034                         return rth;
2035                 }
2036         }
2037
2038 add:
2039         rth = rt_dst_alloc(dev_out, flags, type,
2040                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2041                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2042                            do_cache);
2043         if (!rth)
2044                 return ERR_PTR(-ENOBUFS);
2045
2046         rth->rt_iif     = orig_oif ? : 0;
2047         if (res->table)
2048                 rth->rt_table_id = res->table->tb_id;
2049
2050         RT_CACHE_STAT_INC(out_slow_tot);
2051
2052         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Locally relevant broadcast/multicast leaving through
                      * a real (non-loopback) device goes via ip_mc_output.
                      */
2053                 if (flags & RTCF_LOCAL &&
2054                     !(dev_out->flags & IFF_LOOPBACK)) {
2055                         rth->dst.output = ip_mc_output;
2056                         RT_CACHE_STAT_INC(out_slow_mc);
2057                 }
2058 #ifdef CONFIG_IP_MROUTE
2059                 if (type == RTN_MULTICAST) {
2060                         if (IN_DEV_MFORWARD(in_dev) &&
2061                             !ipv4_is_local_multicast(fl4->daddr)) {
2062                                 rth->dst.input = ip_mr_input;
2063                                 rth->dst.output = ip_mc_output;
2064                         }
2065                 }
2066 #endif
2067         }
2068
2069         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2070         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2071                 rth->dst.output = lwtunnel_output;
2072
2073         return rth;
2074 }
2075
2076 /*
2077  * Major route resolver routine.
      *
      * Resolves the output route for @fl4 in @net and returns it, or an
      * ERR_PTR().  @fl4 is updated in place: iif, tos and scope are
      * normalised on entry, and oif, saddr and daddr may be filled in while
      * resolving.  @mp_hash is passed on to fib_select_path().  Takes and
      * releases rcu_read_lock() internally.
      *
      * Errors produced here: -EINVAL (the source is multicast, limited
      * broadcast or zero-net, or __ip_dev_find() does not find it), -ENODEV
      * (the requested oif does not exist), -ENETUNREACH (the oif device is
      * down or has no in_device), the fib_lookup() error, plus whatever
      * __mkroute_output() returns.  A non-NULL l3mdev_get_rtable() result is
      * returned as is.
2078  */
2079
2080 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2081                                           int mp_hash)
2082 {
2083         struct net_device *dev_out = NULL;
2084         __u8 tos = RT_FL_TOS(fl4);
2085         unsigned int flags = 0;
2086         struct fib_result res;
2087         struct rtable *rth;
2088         int orig_oif;
2089         int err = -ENETUNREACH;
2090
2091         res.tclassid    = 0;
2092         res.fi          = NULL;
2093         res.table       = NULL;
             /* Several "goto make_route" paths below never run fib_lookup()
              * and never assign res.type, yet __mkroute_output() reads it
              * unconditionally.  Give it a defined value so that an
              * uninitialised stack value cannot become the route type.
              */
             res.type        = RTN_UNSPEC;
2094
2095         orig_oif = fl4->flowi4_oif;
2096
2097         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2098         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2099         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2100                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2101
2102         rcu_read_lock();
2103         if (fl4->saddr) {
                     /* Every "goto out" in this branch reports -EINVAL. */
2104                 rth = ERR_PTR(-EINVAL);
2105                 if (ipv4_is_multicast(fl4->saddr) ||
2106                     ipv4_is_lbcast(fl4->saddr) ||
2107                     ipv4_is_zeronet(fl4->saddr))
2108                         goto out;
2109
2110                 /* I removed check for oif == dev_out->oif here.
2111                    It was wrong for two reasons:
2112                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2113                       is assigned to multiple interfaces.
2114                    2. Moreover, we are allowed to send packets with saddr
2115                       of another iface. --ANK
2116                  */
2117
2118                 if (fl4->flowi4_oif == 0 &&
2119                     (ipv4_is_multicast(fl4->daddr) ||
2120                      ipv4_is_lbcast(fl4->daddr))) {
2121                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2122                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2123                         if (!dev_out)
2124                                 goto out;
2125
2126                         /* Special hack: user can direct multicasts
2127                            and limited broadcast via necessary interface
2128                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2129                            This hack is not just for fun, it allows
2130                            vic,vat and friends to work.
2131                            They bind socket to loopback, set ttl to zero
2132                            and expect that it will work.
2133                            From the viewpoint of routing cache they are broken,
2134                            because we are not allowed to build multicast path
2135                            with loopback source addr (look, routing cache
2136                            cannot know, that ttl is zero, so that packet
2137                            will not leave this host and route is valid).
2138                            Luckily, this hack is good workaround.
2139                          */
2140
2141                         fl4->flowi4_oif = dev_out->ifindex;
2142                         goto make_route;
2143                 }
2144
2145                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2146                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2147                         if (!__ip_dev_find(net, fl4->saddr, false))
2148                                 goto out;
2149                 }
2150         }
2151
2152
2153         if (fl4->flowi4_oif) {
2154                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2155                 rth = ERR_PTR(-ENODEV);
2156                 if (!dev_out)
2157                         goto out;
2158
2159                 /* RACE: Check return value of inet_select_addr instead. */
2160                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2161                         rth = ERR_PTR(-ENETUNREACH);
2162                         goto out;
2163                 }
                     /* Link-local multicast, limited broadcast and IGMP are
                      * sent out of the requested device without a FIB lookup.
                      */
2164                 if (ipv4_is_local_multicast(fl4->daddr) ||
2165                     ipv4_is_lbcast(fl4->daddr) ||
2166                     fl4->flowi4_proto == IPPROTO_IGMP) {
2167                         if (!fl4->saddr)
2168                                 fl4->saddr = inet_select_addr(dev_out, 0,
2169                                                               RT_SCOPE_LINK);
2170                         goto make_route;
2171                 }
2172                 if (!fl4->saddr) {
2173                         if (ipv4_is_multicast(fl4->daddr))
2174                                 fl4->saddr = inet_select_addr(dev_out, 0,
2175                                                               fl4->flowi4_scope);
2176                         else if (!fl4->daddr)
2177                                 fl4->saddr = inet_select_addr(dev_out, 0,
2178                                                               RT_SCOPE_HOST);
2179                 }
2180
2181                 rth = l3mdev_get_rtable(dev_out, fl4);
2182                 if (rth)
2183                         goto out;
2184         }
2185
             /* No destination: route to the source address, or to the
              * loopback address when that is unset too, over the loopback
              * device.
              */
2186         if (!fl4->daddr) {
2187                 fl4->daddr = fl4->saddr;
2188                 if (!fl4->daddr)
2189                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2190                 dev_out = net->loopback_dev;
2191                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2192                 res.type = RTN_LOCAL;
2193                 flags |= RTCF_LOCAL;
2194                 goto make_route;
2195         }
2196
2197         err = fib_lookup(net, fl4, &res, 0);
2198         if (err) {
2199                 res.fi = NULL;
2200                 res.table = NULL;
2201                 if (fl4->flowi4_oif &&
2202                     !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2203                         /* Apparently, routing tables are wrong. Assume,
2204                            that the destination is on link.
2205
2206                            WHY? DW.
2207                            Because we are allowed to send to iface
2208                            even if it has NO routes and NO assigned
2209                            addresses. When oif is specified, routing
2210                            tables are looked up with only one purpose:
2211                            to catch if destination is gatewayed, rather than
2212                            direct. Moreover, if MSG_DONTROUTE is set,
2213                            we send packet, ignoring both routing tables
2214                            and ifaddr state. --ANK
2215
2216
2217                            We could make it even if oif is unknown,
2218                            likely IPv6, but we do not.
2219                          */
2220
2221                         if (fl4->saddr == 0)
2222                                 fl4->saddr = inet_select_addr(dev_out, 0,
2223                                                               RT_SCOPE_LINK);
2224                         res.type = RTN_UNICAST;
2225                         goto make_route;
2226                 }
2227                 rth = ERR_PTR(err);
2228                 goto out;
2229         }
2230
2231         if (res.type == RTN_LOCAL) {
                     /* Local destination: use the loopback device; default
                      * the source to the route's preferred source, or to the
                      * destination itself.
                      */
2232                 if (!fl4->saddr) {
2233                         if (res.fi->fib_prefsrc)
2234                                 fl4->saddr = res.fi->fib_prefsrc;
2235                         else
2236                                 fl4->saddr = fl4->daddr;
2237                 }
2238                 dev_out = net->loopback_dev;
2239                 fl4->flowi4_oif = dev_out->ifindex;
2240                 flags |= RTCF_LOCAL;
2241                 goto make_route;
2242         }
2243
2244         fib_select_path(net, &res, fl4, mp_hash);
2245
2246         dev_out = FIB_RES_DEV(res);
2247         fl4->flowi4_oif = dev_out->ifindex;
2248
2249
2250 make_route:
2251         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2252
2253 out:
2254         rcu_read_unlock();
2255         return rth;
2256 }
2257 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2258
/* .check handler of ipv4_dst_blackhole_ops: ignores @dst and @cookie and
 * always returns NULL.
 */
2259 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2260 {
2261         return NULL;
2262 }
2263
/* .mtu handler of ipv4_dst_blackhole_ops: returns the raw RTAX_MTU metric of
 * @dst when it is non-zero, otherwise the MTU of the dst's device.
 */
2264 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2265 {
2266         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2267
2268         return mtu ? : dst->dev->mtu;
2269 }
2270
/* .update_pmtu handler of ipv4_dst_blackhole_ops: intentionally empty, so
 * all arguments are ignored.
 */
2271 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2272                                           struct sk_buff *skb, u32 mtu)
2273 {
2274 }
2275
/* .redirect handler of ipv4_dst_blackhole_ops: intentionally empty, so all
 * arguments are ignored.
 */
2276 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2277                                        struct sk_buff *skb)
2278 {
2279 }
2280
/* .cow_metrics handler of ipv4_dst_blackhole_ops: ignores @dst and @old and
 * always returns NULL.
 */
2281 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2282                                           unsigned long old)
2283 {
2284         return NULL;
2285 }
2286
/* dst_ops used for the entries allocated by ipv4_blackhole_route().  The
 * check, update_pmtu, redirect and cow_metrics handlers are the stub
 * functions defined just above; default_advmss and neigh_lookup point at the
 * regular IPv4 helpers.
 */
2287 static struct dst_ops ipv4_dst_blackhole_ops = {
2288         .family                 =       AF_INET,
2289         .check                  =       ipv4_blackhole_dst_check,
2290         .mtu                    =       ipv4_blackhole_mtu,
2291         .default_advmss         =       ipv4_default_advmss,
2292         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2293         .redirect               =       ipv4_rt_blackhole_redirect,
2294         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2295         .neigh_lookup           =       ipv4_neigh_lookup,
2296 };
2297
/*
 * Build a blackhole copy of @dst_orig: a new dst using
 * ipv4_dst_blackhole_ops whose input and output handlers discard packets,
 * with the device and the rtable fields listed below copied from the
 * original.
 *
 * The reference on @dst_orig is released in every case.  Returns the new
 * dst, or ERR_PTR(-ENOMEM) when dst_alloc() fails.
 */
2298 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2299 {
2300         struct rtable *ort = (struct rtable *) dst_orig;
2301         struct rtable *rt;
2302
2303         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2304         if (rt) {
2305                 struct dst_entry *new = &rt->dst;
2306
2307                 new->__use = 1;
2308                 new->input = dst_discard;
2309                 new->output = dst_discard_out;
2310
                     /* Share the original's device and take a reference on it. */
2311                 new->dev = ort->dst.dev;
2312                 if (new->dev)
2313                         dev_hold(new->dev);
2314
2315                 rt->rt_is_input = ort->rt_is_input;
2316                 rt->rt_iif = ort->rt_iif;
2317                 rt->rt_pmtu = ort->rt_pmtu;
2318
                     /* The generation id is taken from @net, not copied. */
2319                 rt->rt_genid = rt_genid_ipv4(net);
2320                 rt->rt_flags = ort->rt_flags;
2321                 rt->rt_type = ort->rt_type;
2322                 rt->rt_gateway = ort->rt_gateway;
2323                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2324
2325                 INIT_LIST_HEAD(&rt->rt_uncached);
                     /* NOTE(review): dst_free() is called on the entry that is
                      * returned below; presumably it only schedules the entry
                      * for release once its references are gone -- confirm
                      * against the dst core.
                      */
2326                 dst_free(new);
2327         }
2328
2329         dst_release(dst_orig);
2330
2331         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2332 }
2333
/*
 * Look up the output route for @flp4 via __ip_route_output_key() and, when
 * the flow names a protocol (flowi4_proto != 0), pass the result through
 * xfrm_lookup_route() together with @sk.
 *
 * Returns the lookup error unchanged if the first step fails; otherwise the
 * route, or whatever xfrm_lookup_route() returned cast to struct rtable *.
 */
2334 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2335                                     const struct sock *sk)
2336 {
2337         struct rtable *rt = __ip_route_output_key(net, flp4);
2338
2339         if (IS_ERR(rt))
2340                 return rt;
2341
2342         if (flp4->flowi4_proto)
2343                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2344                                                         flowi4_to_flowi(flp4),
2345                                                         sk, 0);
2346
2347         return rt;
2348 }
2349 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2350
/*
 * Append one netlink route message describing the route attached to @skb
 * (skb_rtable(@skb)) to @skb itself.
 *
 * @net:       namespace; consulted for the MC_FORWARDING setting
 * @dst, @src: addresses reported as RTA_DST and, if non-zero, RTA_SRC
 * @table_id:  reported in rtm_table and as RTA_TABLE
 * @fl4:       flow; supplies tos, mark and the source used for RTA_PREFSRC
 * @portid, @seq, @event, @flags: passed to nlmsg_put()
 * @nowait:    passed to ipmr_get_route() and selects how its result is
 *             treated (CONFIG_IP_MROUTE only)
 *
 * Returns 0 on success and -EMSGSIZE when the header or an attribute does
 * not fit; on attribute failure the partial message is cancelled.
 */
2351 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2352                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2353                         u32 seq, int event, int nowait, unsigned int flags)
2354 {
2355         struct rtable *rt = skb_rtable(skb);
2356         struct rtmsg *r;
2357         struct nlmsghdr *nlh;
2358         unsigned long expires = 0;
2359         u32 error;
2360         u32 metrics[RTAX_MAX];
2361
2362         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2363         if (!nlh)
2364                 return -EMSGSIZE;
2365
2366         r = nlmsg_data(nlh);
2367         r->rtm_family    = AF_INET;
2368         r->rtm_dst_len  = 32;
2369         r->rtm_src_len  = 0;
2370         r->rtm_tos      = fl4->flowi4_tos;
2371         r->rtm_table    = table_id;
2372         if (nla_put_u32(skb, RTA_TABLE, table_id))
2373                 goto nla_put_failure;
2374         r->rtm_type     = rt->rt_type;
2375         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2376         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the bits of rt_flags above the low 16 are reported. */
2377         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2378         if (rt->rt_flags & RTCF_NOTIFY)
2379                 r->rtm_flags |= RTM_F_NOTIFY;
2380         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2381                 r->rtm_flags |= RTCF_DOREDIRECT;
2382
2383         if (nla_put_in_addr(skb, RTA_DST, dst))
2384                 goto nla_put_failure;
2385         if (src) {
2386                 r->rtm_src_len = 32;
2387                 if (nla_put_in_addr(skb, RTA_SRC, src))
2388                         goto nla_put_failure;
2389         }
2390         if (rt->dst.dev &&
2391             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2392                 goto nla_put_failure;
2393 #ifdef CONFIG_IP_ROUTE_CLASSID
2394         if (rt->dst.tclassid &&
2395             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2396                 goto nla_put_failure;
2397 #endif
             /* For output routes, report the flow's source as the preferred
              * source when it differs from @src.
              */
2398         if (!rt_is_input_route(rt) &&
2399             fl4->saddr != src) {
2400                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2401                         goto nla_put_failure;
2402         }
2403         if (rt->rt_uses_gateway &&
2404             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2405                 goto nla_put_failure;
2406
             /* Turn the absolute expiry (jiffies) into a remaining time;
              * 0 means no expiry or already expired.
              */
2407         expires = rt->dst.expires;
2408         if (expires) {
2409                 unsigned long now = jiffies;
2410
2411                 if (time_before(now, expires))
2412                         expires -= now;
2413                 else
2414                         expires = 0;
2415         }
2416
             /* Report a local copy of the metrics so that an unexpired
              * rt_pmtu can replace the MTU entry without touching the dst.
              */
2417         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2418         if (rt->rt_pmtu && expires)
2419                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2420         if (rtnetlink_put_metrics(skb, metrics) < 0)
2421                 goto nla_put_failure;
2422
2423         if (fl4->flowi4_mark &&
2424             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2425                 goto nla_put_failure;
2426
2427         error = rt->dst.error;
2428
2429         if (rt_is_input_route(rt)) {
2430 #ifdef CONFIG_IP_MROUTE
2431                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2432                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2433                         int err = ipmr_get_route(net, skb,
2434                                                  fl4->saddr, fl4->daddr,
2435                                                  r, nowait);
                             /* err <= 0 handling:
                              *  - !nowait: 0 returns 0 right away, without
                              *    reaching nlmsg_end(); a negative value
                              *    cancels the message (-EMSGSIZE).
                              *  - nowait: -EMSGSIZE cancels the message; any
                              *    other value is reported via the cacheinfo
                              *    error field below.
                              */
2436                         if (err <= 0) {
2437                                 if (!nowait) {
2438                                         if (err == 0)
2439                                                 return 0;
2440                                         goto nla_put_failure;
2441                                 } else {
2442                                         if (err == -EMSGSIZE)
2443                                                 goto nla_put_failure;
2444                                         error = err;
2445                                 }
2446                         }
2447                 } else
2448 #endif
2449                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2450                                 goto nla_put_failure;
2451         }
2452
2453         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2454                 goto nla_put_failure;
2455
2456         nlmsg_end(skb, nlh);
2457         return 0;
2458
2459 nla_put_failure:
2460         nlmsg_cancel(skb, nlh);
2461         return -EMSGSIZE;
2462 }
2463
2464 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2465 {
2466         struct net *net = sock_net(in_skb->sk);
2467         struct rtmsg *rtm;
2468         struct nlattr *tb[RTA_MAX+1];
2469         struct rtable *rt = NULL;
2470         struct flowi4 fl4;
2471         __be32 dst = 0;
2472         __be32 src = 0;
2473         u32 iif;
2474         int err;
2475         int mark;
2476         struct sk_buff *skb;
2477         u32 table_id = RT_TABLE_MAIN;
2478
2479         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2480         if (err < 0)
2481                 goto errout;
2482
2483         rtm = nlmsg_data(nlh);
2484
2485         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2486         if (!skb) {
2487                 err = -ENOBUFS;
2488                 goto errout;
2489         }
2490
2491         /* Reserve room for dummy headers, this skb can pass
2492            through good chunk of routing engine.
2493          */
2494         skb_reset_mac_header(skb);
2495         skb_reset_network_header(skb);
2496
2497         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2498         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2499         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2500
2501         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2502         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2503         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2504         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2505
2506         memset(&fl4, 0, sizeof(fl4));
2507         fl4.daddr = dst;
2508         fl4.saddr = src;
2509         fl4.flowi4_tos = rtm->rtm_tos;
2510         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2511         fl4.flowi4_mark = mark;
2512
2513         if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2514                 fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2515
2516         if (iif) {
2517                 struct net_device *dev;
2518
2519                 dev = __dev_get_by_index(net, iif);
2520                 if (!dev) {
2521                         err = -ENODEV;
2522                         goto errout_free;
2523                 }
2524
2525                 skb->protocol   = htons(ETH_P_IP);
2526                 skb->dev        = dev;
2527                 skb->mark       = mark;
2528                 local_bh_disable();
2529                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2530                 local_bh_enable();
2531
2532                 rt = skb_rtable(skb);
2533                 if (err == 0 && rt->dst.error)
2534                         err = -rt->dst.error;
2535         } else {
2536                 rt = ip_route_output_key(net, &fl4);
2537
2538                 err = 0;
2539                 if (IS_ERR(rt))
2540                         err = PTR_ERR(rt);
2541         }
2542
2543         if (err)
2544                 goto errout_free;
2545
2546         skb_dst_set(skb, &rt->dst);
2547         if (rtm->rtm_flags & RTM_F_NOTIFY)
2548                 rt->rt_flags |= RTCF_NOTIFY;
2549
2550         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2551                 table_id = rt->rt_table_id;
2552
2553         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2554                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2555                            RTM_NEWROUTE, 0, 0);
2556         if (err < 0)
2557                 goto errout_free;
2558
2559         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2560 errout:
2561         return err;
2562
2563 errout_free:
2564         kfree_skb(skb);
2565         goto errout;
2566 }
2567
2568 void ip_rt_multicast_event(struct in_device *in_dev)
2569 {
2570         rt_cache_flush(dev_net(in_dev->dev));
2571 }
2572
2573 #ifdef CONFIG_SYSCTL
2574 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2575 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2576 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2577 static int ip_rt_gc_elasticity __read_mostly    = 8;
2578
2579 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2580                                         void __user *buffer,
2581                                         size_t *lenp, loff_t *ppos)
2582 {
2583         struct net *net = (struct net *)__ctl->extra1;
2584
2585         if (write) {
2586                 rt_cache_flush(net);
2587                 fnhe_genid_bump(net);
2588                 return 0;
2589         }
2590
2591         return -EINVAL;
2592 }
2593
2594 static struct ctl_table ipv4_route_table[] = {
2595         {
2596                 .procname       = "gc_thresh",
2597                 .data           = &ipv4_dst_ops.gc_thresh,
2598                 .maxlen         = sizeof(int),
2599                 .mode           = 0644,
2600                 .proc_handler   = proc_dointvec,
2601         },
2602         {
2603                 .procname       = "max_size",
2604                 .data           = &ip_rt_max_size,
2605                 .maxlen         = sizeof(int),
2606                 .mode           = 0644,
2607                 .proc_handler   = proc_dointvec,
2608         },
2609         {
2610                 /*  Deprecated. Use gc_min_interval_ms */
2611
2612                 .procname       = "gc_min_interval",
2613                 .data           = &ip_rt_gc_min_interval,
2614                 .maxlen         = sizeof(int),
2615                 .mode           = 0644,
2616                 .proc_handler   = proc_dointvec_jiffies,
2617         },
2618         {
2619                 .procname       = "gc_min_interval_ms",
2620                 .data           = &ip_rt_gc_min_interval,
2621                 .maxlen         = sizeof(int),
2622                 .mode           = 0644,
2623                 .proc_handler   = proc_dointvec_ms_jiffies,
2624         },
2625         {
2626                 .procname       = "gc_timeout",
2627                 .data           = &ip_rt_gc_timeout,
2628                 .maxlen         = sizeof(int),
2629                 .mode           = 0644,
2630                 .proc_handler   = proc_dointvec_jiffies,
2631         },
2632         {
2633                 .procname       = "gc_interval",
2634                 .data           = &ip_rt_gc_interval,
2635                 .maxlen         = sizeof(int),
2636                 .mode           = 0644,
2637                 .proc_handler   = proc_dointvec_jiffies,
2638         },
2639         {
2640                 .procname       = "redirect_load",
2641                 .data           = &ip_rt_redirect_load,
2642                 .maxlen         = sizeof(int),
2643                 .mode           = 0644,
2644                 .proc_handler   = proc_dointvec,
2645         },
2646         {
2647                 .procname       = "redirect_number",
2648                 .data           = &ip_rt_redirect_number,
2649                 .maxlen         = sizeof(int),
2650                 .mode           = 0644,
2651                 .proc_handler   = proc_dointvec,
2652         },
2653         {
2654                 .procname       = "redirect_silence",
2655                 .data           = &ip_rt_redirect_silence,
2656                 .maxlen         = sizeof(int),
2657                 .mode           = 0644,
2658                 .proc_handler   = proc_dointvec,
2659         },
2660         {
2661                 .procname       = "error_cost",
2662                 .data           = &ip_rt_error_cost,
2663                 .maxlen         = sizeof(int),
2664                 .mode           = 0644,
2665                 .proc_handler   = proc_dointvec,
2666         },
2667         {
2668                 .procname       = "error_burst",
2669                 .data           = &ip_rt_error_burst,
2670                 .maxlen         = sizeof(int),
2671                 .mode           = 0644,
2672                 .proc_handler   = proc_dointvec,
2673         },
2674         {
2675                 .procname       = "gc_elasticity",
2676                 .data           = &ip_rt_gc_elasticity,
2677                 .maxlen         = sizeof(int),
2678                 .mode           = 0644,
2679                 .proc_handler   = proc_dointvec,
2680         },
2681         {
2682                 .procname       = "mtu_expires",
2683                 .data           = &ip_rt_mtu_expires,
2684                 .maxlen         = sizeof(int),
2685                 .mode           = 0644,
2686                 .proc_handler   = proc_dointvec_jiffies,
2687         },
2688         {
2689                 .procname       = "min_pmtu",
2690                 .data           = &ip_rt_min_pmtu,
2691                 .maxlen         = sizeof(int),
2692                 .mode           = 0644,
2693                 .proc_handler   = proc_dointvec,
2694         },
2695         {
2696                 .procname       = "min_adv_mss",
2697                 .data           = &ip_rt_min_advmss,
2698                 .maxlen         = sizeof(int),
2699                 .mode           = 0644,
2700                 .proc_handler   = proc_dointvec,
2701         },
2702         { }
2703 };
2704
2705 static struct ctl_table ipv4_route_flush_table[] = {
2706         {
2707                 .procname       = "flush",
2708                 .maxlen         = sizeof(int),
2709                 .mode           = 0200,
2710                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2711         },
2712         { },
2713 };
2714
2715 static __net_init int sysctl_route_net_init(struct net *net)
2716 {
2717         struct ctl_table *tbl;
2718
2719         tbl = ipv4_route_flush_table;
2720         if (!net_eq(net, &init_net)) {
2721                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2722                 if (!tbl)
2723                         goto err_dup;
2724
2725                 /* Don't export sysctls to unprivileged users */
2726                 if (net->user_ns != &init_user_ns)
2727                         tbl[0].procname = NULL;
2728         }
2729         tbl[0].extra1 = net;
2730
2731         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2732         if (!net->ipv4.route_hdr)
2733                 goto err_reg;
2734         return 0;
2735
2736 err_reg:
2737         if (tbl != ipv4_route_flush_table)
2738                 kfree(tbl);
2739 err_dup:
2740         return -ENOMEM;
2741 }
2742
2743 static __net_exit void sysctl_route_net_exit(struct net *net)
2744 {
2745         struct ctl_table *tbl;
2746
2747         tbl = net->ipv4.route_hdr->ctl_table_arg;
2748         unregister_net_sysctl_table(net->ipv4.route_hdr);
2749         BUG_ON(tbl == ipv4_route_flush_table);
2750         kfree(tbl);
2751 }
2752
2753 static __net_initdata struct pernet_operations sysctl_route_ops = {
2754         .init = sysctl_route_net_init,
2755         .exit = sysctl_route_net_exit,
2756 };
2757 #endif
2758
2759 static __net_init int rt_genid_init(struct net *net)
2760 {
2761         atomic_set(&net->ipv4.rt_genid, 0);
2762         atomic_set(&net->fnhe_genid, 0);
2763         get_random_bytes(&net->ipv4.dev_addr_genid,
2764                          sizeof(net->ipv4.dev_addr_genid));
2765         return 0;
2766 }
2767
2768 static __net_initdata struct pernet_operations rt_genid_ops = {
2769         .init = rt_genid_init,
2770 };
2771
2772 static int __net_init ipv4_inetpeer_init(struct net *net)
2773 {
2774         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2775
2776         if (!bp)
2777                 return -ENOMEM;
2778         inet_peer_base_init(bp);
2779         net->ipv4.peers = bp;
2780         return 0;
2781 }
2782
2783 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2784 {
2785         struct inet_peer_base *bp = net->ipv4.peers;
2786
2787         net->ipv4.peers = NULL;
2788         inetpeer_invalidate_tree(bp);
2789         kfree(bp);
2790 }
2791
2792 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2793         .init   =       ipv4_inetpeer_init,
2794         .exit   =       ipv4_inetpeer_exit,
2795 };
2796
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-cpu accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per cpu.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2800
2801 int __init ip_rt_init(void)
2802 {
2803         int rc = 0;
2804         int cpu;
2805
2806         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2807         if (!ip_idents)
2808                 panic("IP: failed to allocate ip_idents\n");
2809
2810         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2811
2812         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2813         if (!ip_tstamps)
2814                 panic("IP: failed to allocate ip_tstamps\n");
2815
2816         for_each_possible_cpu(cpu) {
2817                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2818
2819                 INIT_LIST_HEAD(&ul->head);
2820                 spin_lock_init(&ul->lock);
2821         }
2822 #ifdef CONFIG_IP_ROUTE_CLASSID
2823         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2824         if (!ip_rt_acct)
2825                 panic("IP: failed to allocate ip_rt_acct\n");
2826 #endif
2827
2828         ipv4_dst_ops.kmem_cachep =
2829                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2830                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2831
2832         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2833
2834         if (dst_entries_init(&ipv4_dst_ops) < 0)
2835                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2836
2837         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2838                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2839
2840         ipv4_dst_ops.gc_thresh = ~0;
2841         ip_rt_max_size = INT_MAX;
2842
2843         devinet_init();
2844         ip_fib_init();
2845
2846         if (ip_rt_proc_init())
2847                 pr_err("Unable to create route proc files\n");
2848 #ifdef CONFIG_XFRM
2849         xfrm_init();
2850         xfrm4_init();
2851 #endif
2852         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2853
2854 #ifdef CONFIG_SYSCTL
2855         register_pernet_subsys(&sysctl_route_ops);
2856 #endif
2857         register_pernet_subsys(&rt_genid_ops);
2858         register_pernet_subsys(&ipv4_inetpeer_ops);
2859         return rc;
2860 }
2861
2862 #ifdef CONFIG_SYSCTL
2863 /*
2864  * We really need to sanitize the damn ipv4 init order, then all
2865  * this nonsense will go away.
2866  */
2867 void __init ip_static_sysctl_init(void)
2868 {
2869         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2870 }
2871 #endif