net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <net/dst.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/netevent.h>
 107 #include <net/rtnetlink.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111
 112 #define RT_FL_TOS(oldflp) \
 113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
 115 #define IP_MAX_MTU      0xFFF0
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_max_size;
 120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 123 static int ip_rt_redirect_number __read_mostly  = 9;
 124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 126 static int ip_rt_error_cost __read_mostly       = HZ;
 127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 128 static int ip_rt_gc_elasticity __read_mostly    = 8;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132 static int rt_chain_length_max __read_mostly    = 20;
 133
 134 static struct delayed_work expires_work;
 135 static unsigned long expires_ljiffies;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static void              ipv4_dst_destroy(struct dst_entry *dst);
 143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 144                                          struct net_device *dev, int how);
 145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 146 static void              ipv4_link_failure(struct sk_buff *skb);
 147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 148 static int rt_garbage_collect(struct dst_ops *ops);
 149
 150
 151 static struct dst_ops ipv4_dst_ops = {
 152         .family =               AF_INET,
 153         .protocol =             cpu_to_be16(ETH_P_IP),
 154         .gc =                   rt_garbage_collect,
 155         .check =                ipv4_dst_check,
 156         .destroy =              ipv4_dst_destroy,
 157         .ifdown =               ipv4_dst_ifdown,
 158         .negative_advice =      ipv4_negative_advice,
 159         .link_failure =         ipv4_link_failure,
 160         .update_pmtu =          ip_rt_update_pmtu,
 161         .local_out =            __ip_local_out,
 162 };
 163
 164 #define ECN_OR_COST(class)      TC_PRIO_##class
 165
 166 const __u8 ip_tos2prio[16] = {
 167         TC_PRIO_BESTEFFORT,
 168         ECN_OR_COST(FILLER),
 169         TC_PRIO_BESTEFFORT,
 170         ECN_OR_COST(BESTEFFORT),
 171         TC_PRIO_BULK,
 172         ECN_OR_COST(BULK),
 173         TC_PRIO_BULK,
 174         ECN_OR_COST(BULK),
 175         TC_PRIO_INTERACTIVE,
 176         ECN_OR_COST(INTERACTIVE),
 177         TC_PRIO_INTERACTIVE,
 178         ECN_OR_COST(INTERACTIVE),
 179         TC_PRIO_INTERACTIVE_BULK,
 180         ECN_OR_COST(INTERACTIVE_BULK),
 181         TC_PRIO_INTERACTIVE_BULK,
 182         ECN_OR_COST(INTERACTIVE_BULK)
 183 };
 184
 185
 186 /*
 187  * Route cache.
 188  */
 189
 190 /* The locking scheme is rather straight forward:
 191  *
 192  * 1) Read-Copy Update protects the buckets of the central route hash.
 193  * 2) Only writers remove entries, and they hold the lock
 194  *    as they look at rtable reference counts.
 195  * 3) Only readers acquire references to rtable entries,
 196  *    they do so with atomic increments and with the
 197  *    lock held.
 198  */
 199
 200 struct rt_hash_bucket {
 201         struct rtable   *chain;
 202 };
 203
 204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 205         defined(CONFIG_PROVE_LOCKING)
 206 /*
 207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 208  * The size of this table is a power of two and depends on the number of CPUS.
 209  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 210  */
 211 #ifdef CONFIG_LOCKDEP
 212 # define RT_HASH_LOCK_SZ        256
 213 #else
 214 # if NR_CPUS >= 32
 215 #  define RT_HASH_LOCK_SZ       4096
 216 # elif NR_CPUS >= 16
 217 #  define RT_HASH_LOCK_SZ       2048
 218 # elif NR_CPUS >= 8
 219 #  define RT_HASH_LOCK_SZ       1024
 220 # elif NR_CPUS >= 4
 221 #  define RT_HASH_LOCK_SZ       512
 222 # else
 223 #  define RT_HASH_LOCK_SZ       256
 224 # endif
 225 #endif
 226
 227 static spinlock_t       *rt_hash_locks;
 228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 229
 230 static __init void rt_hash_lock_init(void)
 231 {
 232         int i;
 233
 234         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 235                         GFP_KERNEL);
 236         if (!rt_hash_locks)
 237                 panic("IP: failed to allocate rt_hash_locks\n");
 238
 239         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 240                 spin_lock_init(&rt_hash_locks[i]);
 241 }
 242 #else
 243 # define rt_hash_lock_addr(slot) NULL
 244
 245 static inline void rt_hash_lock_init(void)
 246 {
 247 }
 248 #endif
 249
 250 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 251 static unsigned                 rt_hash_mask __read_mostly;
 252 static unsigned int             rt_hash_log  __read_mostly;
 253
 254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 255 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 256
 257 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 258                                    int genid)
 259 {
 260         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 261                             idx, genid)
 262                 & rt_hash_mask;
 263 }
 264
 265 static inline int rt_genid(struct net *net)
 266 {
 267         return atomic_read(&net->ipv4.rt_genid);
 268 }
 269
 270 #ifdef CONFIG_PROC_FS
 271 struct rt_cache_iter_state {
 272         struct seq_net_private p;
 273         int bucket;
 274         int genid;
 275 };
 276
 277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 278 {
 279         struct rt_cache_iter_state *st = seq->private;
 280         struct rtable *r = NULL;
 281
 282         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 283                 if (!rt_hash_table[st->bucket].chain)
 284                         continue;
 285                 rcu_read_lock_bh();
 286                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 287                 while (r) {
 288                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 289                             r->rt_genid == st->genid)
 290                                 return r;
 291                         r = rcu_dereference_bh(r->dst.rt_next);
 292                 }
 293                 rcu_read_unlock_bh();
 294         }
 295         return r;
 296 }
 297
 298 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 299                                           struct rtable *r)
 300 {
 301         struct rt_cache_iter_state *st = seq->private;
 302
 303         r = r->dst.rt_next;
 304         while (!r) {
 305                 rcu_read_unlock_bh();
 306                 do {
 307                         if (--st->bucket < 0)
 308                                 return NULL;
 309                 } while (!rt_hash_table[st->bucket].chain);
 310                 rcu_read_lock_bh();
 311                 r = rt_hash_table[st->bucket].chain;
 312         }
 313         return rcu_dereference_bh(r);
 314 }
 315
 316 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 317                                         struct rtable *r)
 318 {
 319         struct rt_cache_iter_state *st = seq->private;
 320         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 321                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 322                         continue;
 323                 if (r->rt_genid == st->genid)
 324                         break;
 325         }
 326         return r;
 327 }
 328
 329 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 330 {
 331         struct rtable *r = rt_cache_get_first(seq);
 332
 333         if (r)
 334                 while (pos && (r = rt_cache_get_next(seq, r)))
 335                         --pos;
 336         return pos ? NULL : r;
 337 }
 338
 339 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 340 {
 341         struct rt_cache_iter_state *st = seq->private;
 342         if (*pos)
 343                 return rt_cache_get_idx(seq, *pos - 1);
 344         st->genid = rt_genid(seq_file_net(seq));
 345         return SEQ_START_TOKEN;
 346 }
 347
 348 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 349 {
 350         struct rtable *r;
 351
 352         if (v == SEQ_START_TOKEN)
 353                 r = rt_cache_get_first(seq);
 354         else
 355                 r = rt_cache_get_next(seq, v);
 356         ++*pos;
 357         return r;
 358 }
 359
 360 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 361 {
 362         if (v && v != SEQ_START_TOKEN)
 363                 rcu_read_unlock_bh();
 364 }
 365
 366 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 367 {
 368         if (v == SEQ_START_TOKEN)
 369                 seq_printf(seq, "%-127s\n",
 370                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 371                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 372                            "HHUptod\tSpecDst");
 373         else {
 374                 struct rtable *r = v;
 375                 int len;
 376
 377                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 378                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 379                         r->dst.dev ? r->dst.dev->name : "*",
 380                         (__force u32)r->rt_dst,
 381                         (__force u32)r->rt_gateway,
 382                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 383                         r->dst.__use, 0, (__force u32)r->rt_src,
 384                         (dst_metric(&r->dst, RTAX_ADVMSS) ?
 385                              (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
 386                         dst_metric(&r->dst, RTAX_WINDOW),
 387                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 388                               dst_metric(&r->dst, RTAX_RTTVAR)),
 389                         r->fl.fl4_tos,
 390                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
 391                         r->dst.hh ? (r->dst.hh->hh_output ==
 392                                        dev_queue_xmit) : 0,
 393                         r->rt_spec_dst, &len);
 394
 395                 seq_printf(seq, "%*s\n", 127 - len, "");
 396         }
 397         return 0;
 398 }
 399
 400 static const struct seq_operations rt_cache_seq_ops = {
 401         .start  = rt_cache_seq_start,
 402         .next   = rt_cache_seq_next,
 403         .stop   = rt_cache_seq_stop,
 404         .show   = rt_cache_seq_show,
 405 };
 406
 407 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 408 {
 409         return seq_open_net(inode, file, &rt_cache_seq_ops,
 410                         sizeof(struct rt_cache_iter_state));
 411 }
 412
 413 static const struct file_operations rt_cache_seq_fops = {
 414         .owner   = THIS_MODULE,
 415         .open    = rt_cache_seq_open,
 416         .read    = seq_read,
 417         .llseek  = seq_lseek,
 418         .release = seq_release_net,
 419 };
 420
 421
 422 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 423 {
 424         int cpu;
 425
 426         if (*pos == 0)
 427                 return SEQ_START_TOKEN;
 428
 429         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 430                 if (!cpu_possible(cpu))
 431                         continue;
 432                 *pos = cpu+1;
 433                 return &per_cpu(rt_cache_stat, cpu);
 434         }
 435         return NULL;
 436 }
 437
 438 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 439 {
 440         int cpu;
 441
 442         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 443                 if (!cpu_possible(cpu))
 444                         continue;
 445                 *pos = cpu+1;
 446                 return &per_cpu(rt_cache_stat, cpu);
 447         }
 448         return NULL;
 449
 450 }
 451
 452 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 453 {
 454
 455 }
 456
 457 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 458 {
 459         struct rt_cache_stat *st = v;
 460
 461         if (v == SEQ_START_TOKEN) {
 462                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 463                 return 0;
 464         }
 465
 466         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 467                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 468                    dst_entries_get_slow(&ipv4_dst_ops),
 469                    st->in_hit,
 470                    st->in_slow_tot,
 471                    st->in_slow_mc,
 472                    st->in_no_route,
 473                    st->in_brd,
 474                    st->in_martian_dst,
 475                    st->in_martian_src,
 476
 477                    st->out_hit,
 478                    st->out_slow_tot,
 479                    st->out_slow_mc,
 480
 481                    st->gc_total,
 482                    st->gc_ignored,
 483                    st->gc_goal_miss,
 484                    st->gc_dst_overflow,
 485                    st->in_hlist_search,
 486                    st->out_hlist_search
 487                 );
 488         return 0;
 489 }
 490
 491 static const struct seq_operations rt_cpu_seq_ops = {
 492         .start  = rt_cpu_seq_start,
 493         .next   = rt_cpu_seq_next,
 494         .stop   = rt_cpu_seq_stop,
 495         .show   = rt_cpu_seq_show,
 496 };
 497
 498
 499 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 500 {
 501         return seq_open(file, &rt_cpu_seq_ops);
 502 }
 503
 504 static const struct file_operations rt_cpu_seq_fops = {
 505         .owner   = THIS_MODULE,
 506         .open    = rt_cpu_seq_open,
 507         .read    = seq_read,
 508         .llseek  = seq_lseek,
 509         .release = seq_release,
 510 };
 511
 512 #ifdef CONFIG_NET_CLS_ROUTE
 513 static int rt_acct_proc_show(struct seq_file *m, void *v)
 514 {
 515         struct ip_rt_acct *dst, *src;
 516         unsigned int i, j;
 517
 518         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 519         if (!dst)
 520                 return -ENOMEM;
 521
 522         for_each_possible_cpu(i) {
 523                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 524                 for (j = 0; j < 256; j++) {
 525                         dst[j].o_bytes   += src[j].o_bytes;
 526                         dst[j].o_packets += src[j].o_packets;
 527                         dst[j].i_bytes   += src[j].i_bytes;
 528                         dst[j].i_packets += src[j].i_packets;
 529                 }
 530         }
 531
 532         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 533         kfree(dst);
 534         return 0;
 535 }
 536
 537 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 538 {
 539         return single_open(file, rt_acct_proc_show, NULL);
 540 }
 541
 542 static const struct file_operations rt_acct_proc_fops = {
 543         .owner          = THIS_MODULE,
 544         .open           = rt_acct_proc_open,
 545         .read           = seq_read,
 546         .llseek         = seq_lseek,
 547         .release        = single_release,
 548 };
 549 #endif
 550
 551 static int __net_init ip_rt_do_proc_init(struct net *net)
 552 {
 553         struct proc_dir_entry *pde;
 554
 555         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 556                         &rt_cache_seq_fops);
 557         if (!pde)
 558                 goto err1;
 559
 560         pde = proc_create("rt_cache", S_IRUGO,
 561                           net->proc_net_stat, &rt_cpu_seq_fops);
 562         if (!pde)
 563                 goto err2;
 564
 565 #ifdef CONFIG_NET_CLS_ROUTE
 566         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 567         if (!pde)
 568                 goto err3;
 569 #endif
 570         return 0;
 571
 572 #ifdef CONFIG_NET_CLS_ROUTE
 573 err3:
 574         remove_proc_entry("rt_cache", net->proc_net_stat);
 575 #endif
 576 err2:
 577         remove_proc_entry("rt_cache", net->proc_net);
 578 err1:
 579         return -ENOMEM;
 580 }
 581
 582 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 583 {
 584         remove_proc_entry("rt_cache", net->proc_net_stat);
 585         remove_proc_entry("rt_cache", net->proc_net);
 586 #ifdef CONFIG_NET_CLS_ROUTE
 587         remove_proc_entry("rt_acct", net->proc_net);
 588 #endif
 589 }
 590
 591 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 592         .init = ip_rt_do_proc_init,
 593         .exit = ip_rt_do_proc_exit,
 594 };
 595
 596 static int __init ip_rt_proc_init(void)
 597 {
 598         return register_pernet_subsys(&ip_rt_proc_ops);
 599 }
 600
 601 #else
 602 static inline int ip_rt_proc_init(void)
 603 {
 604         return 0;
 605 }
 606 #endif /* CONFIG_PROC_FS */
 607
 608 static inline void rt_free(struct rtable *rt)
 609 {
 610         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 611 }
 612
 613 static inline void rt_drop(struct rtable *rt)
 614 {
 615         ip_rt_put(rt);
 616         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 617 }
 618
 619 static inline int rt_fast_clean(struct rtable *rth)
 620 {
 621         /* Kill broadcast/multicast entries very aggresively, if they
 622            collide in hash table with more useful entries */
 623         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 624                 rth->fl.iif && rth->dst.rt_next;
 625 }
 626
 627 static inline int rt_valuable(struct rtable *rth)
 628 {
 629         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 630                 rth->dst.expires;
 631 }
 632
 633 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 634 {
 635         unsigned long age;
 636         int ret = 0;
 637
 638         if (atomic_read(&rth->dst.__refcnt))
 639                 goto out;
 640
 641         ret = 1;
 642         if (rth->dst.expires &&
 643             time_after_eq(jiffies, rth->dst.expires))
 644                 goto out;
 645
 646         age = jiffies - rth->dst.lastuse;
 647         ret = 0;
 648         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 649             (age <= tmo2 && rt_valuable(rth)))
 650                 goto out;
 651         ret = 1;
 652 out:    return ret;
 653 }
 654
 655 /* Bits of score are:
 656  * 31: very valuable
 657  * 30: not quite useless
 658  * 29..0: usage counter
 659  */
 660 static inline u32 rt_score(struct rtable *rt)
 661 {
 662         u32 score = jiffies - rt->dst.lastuse;
 663
 664         score = ~score & ~(3<<30);
 665
 666         if (rt_valuable(rt))
 667                 score |= (1<<31);
 668
 669         if (!rt->fl.iif ||
 670             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 671                 score |= (1<<30);
 672
 673         return score;
 674 }
 675
 676 static inline bool rt_caching(const struct net *net)
 677 {
 678         return net->ipv4.current_rt_cache_rebuild_count <=
 679                 net->ipv4.sysctl_rt_cache_rebuild_count;
 680 }
 681
 682 static inline bool compare_hash_inputs(const struct flowi *fl1,
 683                                         const struct flowi *fl2)
 684 {
 685         return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
 686                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
 687                 (fl1->iif ^ fl2->iif)) == 0);
 688 }
 689
 690 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 691 {
 692         return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
 693                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
 694                 (fl1->mark ^ fl2->mark) |
 695                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
 696                 (fl1->oif ^ fl2->oif) |
 697                 (fl1->iif ^ fl2->iif)) == 0;
 698 }
 699
 700 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 701 {
 702         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 703 }
 704
 705 static inline int rt_is_expired(struct rtable *rth)
 706 {
 707         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 708 }
 709
 710 /*
 711  * Perform a full scan of hash table and free all entries.
 712  * Can be called by a softirq or a process.
 713  * In the later case, we want to be reschedule if necessary
 714  */
 715 static void rt_do_flush(int process_context)
 716 {
 717         unsigned int i;
 718         struct rtable *rth, *next;
 719         struct rtable * tail;
 720
 721         for (i = 0; i <= rt_hash_mask; i++) {
 722                 if (process_context && need_resched())
 723                         cond_resched();
 724                 rth = rt_hash_table[i].chain;
 725                 if (!rth)
 726                         continue;
 727
 728                 spin_lock_bh(rt_hash_lock_addr(i));
 729 #ifdef CONFIG_NET_NS
 730                 {
 731                 struct rtable ** prev, * p;
 732
 733                 rth = rt_hash_table[i].chain;
 734
 735                 /* defer releasing the head of the list after spin_unlock */
 736                 for (tail = rth; tail; tail = tail->dst.rt_next)
 737                         if (!rt_is_expired(tail))
 738                                 break;
 739                 if (rth != tail)
 740                         rt_hash_table[i].chain = tail;
 741
 742                 /* call rt_free on entries after the tail requiring flush */
 743                 prev = &rt_hash_table[i].chain;
 744                 for (p = *prev; p; p = next) {
 745                         next = p->dst.rt_next;
 746                         if (!rt_is_expired(p)) {
 747                                 prev = &p->dst.rt_next;
 748                         } else {
 749                                 *prev = next;
 750                                 rt_free(p);
 751                         }
 752                 }
 753                 }
 754 #else
 755                 rth = rt_hash_table[i].chain;
 756                 rt_hash_table[i].chain = NULL;
 757                 tail = NULL;
 758 #endif
 759                 spin_unlock_bh(rt_hash_lock_addr(i));
 760
 761                 for (; rth != tail; rth = next) {
 762                         next = rth->dst.rt_next;
 763                         rt_free(rth);
 764                 }
 765         }
 766 }
 767
 768 /*
 769  * While freeing expired entries, we compute average chain length
 770  * and standard deviation, using fixed-point arithmetic.
 771  * This to have an estimation of rt_chain_length_max
 772  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 773  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 774  */
 775
 776 #define FRACT_BITS 3
 777 #define ONE (1UL << FRACT_BITS)
 778
 779 /*
 780  * Given a hash chain and an item in this hash chain,
 781  * find if a previous entry has the same hash_inputs
 782  * (but differs on tos, mark or oif)
 783  * Returns 0 if an alias is found.
 784  * Returns ONE if rth has no alias before itself.
 785  */
 786 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 787 {
 788         const struct rtable *aux = head;
 789
 790         while (aux != rth) {
 791                 if (compare_hash_inputs(&aux->fl, &rth->fl))
 792                         return 0;
 793                 aux = aux->dst.rt_next;
 794         }
 795         return ONE;
 796 }
 797
 798 static void rt_check_expire(void)
 799 {
 800         static unsigned int rover;
 801         unsigned int i = rover, goal;
 802         struct rtable *rth, **rthp;
 803         unsigned long samples = 0;
 804         unsigned long sum = 0, sum2 = 0;
 805         unsigned long delta;
 806         u64 mult;
 807
 808         delta = jiffies - expires_ljiffies;
 809         expires_ljiffies = jiffies;
 810         mult = ((u64)delta) << rt_hash_log;
 811         if (ip_rt_gc_timeout > 1)
 812                 do_div(mult, ip_rt_gc_timeout);
 813         goal = (unsigned int)mult;
 814         if (goal > rt_hash_mask)
 815                 goal = rt_hash_mask + 1;
 816         for (; goal > 0; goal--) {
 817                 unsigned long tmo = ip_rt_gc_timeout;
 818                 unsigned long length;
 819
 820                 i = (i + 1) & rt_hash_mask;
 821                 rthp = &rt_hash_table[i].chain;
 822
 823                 if (need_resched())
 824                         cond_resched();
 825
 826                 samples++;
 827
 828                 if (*rthp == NULL)
 829                         continue;
 830                 length = 0;
 831                 spin_lock_bh(rt_hash_lock_addr(i));
 832                 while ((rth = *rthp) != NULL) {
 833                         prefetch(rth->dst.rt_next);
 834                         if (rt_is_expired(rth)) {
 835                                 *rthp = rth->dst.rt_next;
 836                                 rt_free(rth);
 837                                 continue;
 838                         }
 839                         if (rth->dst.expires) {
 840                                 /* Entry is expired even if it is in use */
 841                                 if (time_before_eq(jiffies, rth->dst.expires)) {
 842 nofree:
 843                                         tmo >>= 1;
 844                                         rthp = &rth->dst.rt_next;
 845                                         /*
 846                                          * We only count entries on
 847                                          * a chain with equal hash inputs once
 848                                          * so that entries for different QOS
 849                                          * levels, and other non-hash input
 850                                          * attributes don't unfairly skew
 851                                          * the length computation
 852                                          */
 853                                         length += has_noalias(rt_hash_table[i].chain, rth);
 854                                         continue;
 855                                 }
 856                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 857                                 goto nofree;
 858
 859                         /* Cleanup aged off entries. */
 860                         *rthp = rth->dst.rt_next;
 861                         rt_free(rth);
 862                 }
 863                 spin_unlock_bh(rt_hash_lock_addr(i));
 864                 sum += length;
 865                 sum2 += length*length;
 866         }
 867         if (samples) {
 868                 unsigned long avg = sum / samples;
 869                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 870                 rt_chain_length_max = max_t(unsigned long,
 871                                         ip_rt_gc_elasticity,
 872                                         (avg + 4*sd) >> FRACT_BITS);
 873         }
 874         rover = i;
 875 }
 876
 877 /*
 878  * rt_worker_func() is run in process context.
 879  * we call rt_check_expire() to scan part of the hash table
 880  */
 881 static void rt_worker_func(struct work_struct *work)
 882 {
 883         rt_check_expire();
 884         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 885 }
 886
 887 /*
 888  * Pertubation of rt_genid by a small quantity [1..256]
 889  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 890  * many times (2^24) without giving recent rt_genid.
 891  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 892  */
 893 static void rt_cache_invalidate(struct net *net)
 894 {
 895         unsigned char shuffle;
 896
 897         get_random_bytes(&shuffle, sizeof(shuffle));
 898         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 899 }
 900
 901 /*
 902  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 903  * delay >= 0 : invalidate & flush cache (can be long)
 904  */
 905 void rt_cache_flush(struct net *net, int delay)
 906 {
 907         rt_cache_invalidate(net);
 908         if (delay >= 0)
 909                 rt_do_flush(!in_softirq());
 910 }
 911
 912 /* Flush previous cache invalidated entries from the cache */
 913 void rt_cache_flush_batch(void)
 914 {
 915         rt_do_flush(!in_softirq());
 916 }
 917
 918 static void rt_emergency_hash_rebuild(struct net *net)
 919 {
 920         if (net_ratelimit())
 921                 printk(KERN_WARNING "Route hash chain too long!\n");
 922         rt_cache_invalidate(net);
 923 }
 924
 925 /*
 926    Short description of GC goals.
 927
 928    We want to build algorithm, which will keep routing cache
 929    at some equilibrium point, when number of aged off entries
 930    is kept approximately equal to newly generated ones.
 931
 932    Current expiration strength is variable "expire".
 933    We try to adjust it dynamically, so that if networking
 934    is idle expires is large enough to keep enough of warm entries,
 935    and when load increases it reduces to limit cache size.
 936  */
 937
 938 static int rt_garbage_collect(struct dst_ops *ops)
 939 {
 940         static unsigned long expire = RT_GC_TIMEOUT;
 941         static unsigned long last_gc;
 942         static int rover;
 943         static int equilibrium;
 944         struct rtable *rth, **rthp;
 945         unsigned long now = jiffies;
 946         int goal;
 947         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 948
 949         /*
 950          * Garbage collection is pretty expensive,
 951          * do not make it too frequently.
 952          */
 953
 954         RT_CACHE_STAT_INC(gc_total);
 955
 956         if (now - last_gc < ip_rt_gc_min_interval &&
 957             entries < ip_rt_max_size) {
 958                 RT_CACHE_STAT_INC(gc_ignored);
 959                 goto out;
 960         }
 961
 962         entries = dst_entries_get_slow(&ipv4_dst_ops);
 963         /* Calculate number of entries, which we want to expire now. */
 964         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 965         if (goal <= 0) {
 966                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 967                         equilibrium = ipv4_dst_ops.gc_thresh;
 968                 goal = entries - equilibrium;
 969                 if (goal > 0) {
 970                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 971                         goal = entries - equilibrium;
 972                 }
 973         } else {
 974                 /* We are in dangerous area. Try to reduce cache really
 975                  * aggressively.
 976                  */
 977                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 978                 equilibrium = entries - goal;
 979         }
 980
 981         if (now - last_gc >= ip_rt_gc_min_interval)
 982                 last_gc = now;
 983
 984         if (goal <= 0) {
 985                 equilibrium += goal;
 986                 goto work_done;
 987         }
 988
 989         do {
 990                 int i, k;
 991
 992                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 993                         unsigned long tmo = expire;
 994
 995                         k = (k + 1) & rt_hash_mask;
 996                         rthp = &rt_hash_table[k].chain;
 997                         spin_lock_bh(rt_hash_lock_addr(k));
 998                         while ((rth = *rthp) != NULL) {
 999                                 if (!rt_is_expired(rth) &&
1000                                         !rt_may_expire(rth, tmo, expire)) {
1001                                         tmo >>= 1;
1002                                         rthp = &rth->dst.rt_next;
1003                                         continue;
1004                                 }
1005                                 *rthp = rth->dst.rt_next;
1006                                 rt_free(rth);
1007                                 goal--;
1008                         }
1009                         spin_unlock_bh(rt_hash_lock_addr(k));
1010                         if (goal <= 0)
1011                                 break;
1012                 }
1013                 rover = k;
1014
1015                 if (goal <= 0)
1016                         goto work_done;
1017
1018                 /* Goal is not achieved. We stop process if:
1019
1020                    - if expire reduced to zero. Otherwise, expire is halfed.
1021                    - if table is not full.
1022                    - if we are called from interrupt.
1023                    - jiffies check is just fallback/debug loop breaker.
1024                      We will not spin here for long time in any case.
1025                  */
1026
1027                 RT_CACHE_STAT_INC(gc_goal_miss);
1028
1029                 if (expire == 0)
1030                         break;
1031
1032                 expire >>= 1;
1033 #if RT_CACHE_DEBUG >= 2
1034                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1036 #endif
1037
1038                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1039                         goto out;
1040         } while (!in_softirq() && time_before_eq(jiffies, now));
1041
1042         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1043                 goto out;
1044         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1045                 goto out;
1046         if (net_ratelimit())
1047                 printk(KERN_WARNING "dst cache overflow\n");
1048         RT_CACHE_STAT_INC(gc_dst_overflow);
1049         return 1;
1050
1051 work_done:
1052         expire += ip_rt_gc_min_interval;
1053         if (expire > ip_rt_gc_timeout ||
1054             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1055             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1056                 expire = ip_rt_gc_timeout;
1057 #if RT_CACHE_DEBUG >= 2
1058         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1059                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1060 #endif
1061 out:    return 0;
1062 }
1063
1064 /*
1065  * Returns number of entries in a hash chain that have different hash_inputs
1066  */
1067 static int slow_chain_length(const struct rtable *head)
1068 {
1069         int length = 0;
1070         const struct rtable *rth = head;
1071
1072         while (rth) {
1073                 length += has_noalias(head, rth);
1074                 rth = rth->dst.rt_next;
1075         }
1076         return length >> FRACT_BITS;
1077 }
1078
1079 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1080                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1081 {
1082         struct rtable   *rth, **rthp;
1083         unsigned long   now;
1084         struct rtable *cand, **candp;
1085         u32             min_score;
1086         int             chain_length;
1087         int attempts = !in_softirq();
1088
1089 restart:
1090         chain_length = 0;
1091         min_score = ~(u32)0;
1092         cand = NULL;
1093         candp = NULL;
1094         now = jiffies;
1095
1096         if (!rt_caching(dev_net(rt->dst.dev))) {
1097                 /*
1098                  * If we're not caching, just tell the caller we
1099                  * were successful and don't touch the route.  The
1100                  * caller hold the sole reference to the cache entry, and
1101                  * it will be released when the caller is done with it.
1102                  * If we drop it here, the callers have no way to resolve routes
1103                  * when we're not caching.  Instead, just point *rp at rt, so
1104                  * the caller gets a single use out of the route
1105                  * Note that we do rt_free on this new route entry, so that
1106                  * once its refcount hits zero, we are still able to reap it
1107                  * (Thanks Alexey)
1108                  * Note: To avoid expensive rcu stuff for this uncached dst,
1109                  * we set DST_NOCACHE so that dst_release() can free dst without
1110                  * waiting a grace period.
1111                  */
1112
1113                 rt->dst.flags |= DST_NOCACHE;
1114                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1115                         int err = arp_bind_neighbour(&rt->dst);
1116                         if (err) {
1117                                 if (net_ratelimit())
1118                                         printk(KERN_WARNING
1119                                             "Neighbour table failure & not caching routes.\n");
1120                                 ip_rt_put(rt);
1121                                 return err;
1122                         }
1123                 }
1124
1125                 goto skip_hashing;
1126         }
1127
1128         rthp = &rt_hash_table[hash].chain;
1129
1130         spin_lock_bh(rt_hash_lock_addr(hash));
1131         while ((rth = *rthp) != NULL) {
1132                 if (rt_is_expired(rth)) {
1133                         *rthp = rth->dst.rt_next;
1134                         rt_free(rth);
1135                         continue;
1136                 }
1137                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1138                         /* Put it first */
1139                         *rthp = rth->dst.rt_next;
1140                         /*
1141                          * Since lookup is lockfree, the deletion
1142                          * must be visible to another weakly ordered CPU before
1143                          * the insertion at the start of the hash chain.
1144                          */
1145                         rcu_assign_pointer(rth->dst.rt_next,
1146                                            rt_hash_table[hash].chain);
1147                         /*
1148                          * Since lookup is lockfree, the update writes
1149                          * must be ordered for consistency on SMP.
1150                          */
1151                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1152
1153                         dst_use(&rth->dst, now);
1154                         spin_unlock_bh(rt_hash_lock_addr(hash));
1155
1156                         rt_drop(rt);
1157                         if (rp)
1158                                 *rp = rth;
1159                         else
1160                                 skb_dst_set(skb, &rth->dst);
1161                         return 0;
1162                 }
1163
1164                 if (!atomic_read(&rth->dst.__refcnt)) {
1165                         u32 score = rt_score(rth);
1166
1167                         if (score <= min_score) {
1168                                 cand = rth;
1169                                 candp = rthp;
1170                                 min_score = score;
1171                         }
1172                 }
1173
1174                 chain_length++;
1175
1176                 rthp = &rth->dst.rt_next;
1177         }
1178
1179         if (cand) {
1180                 /* ip_rt_gc_elasticity used to be average length of chain
1181                  * length, when exceeded gc becomes really aggressive.
1182                  *
1183                  * The second limit is less certain. At the moment it allows
1184                  * only 2 entries per bucket. We will see.
1185                  */
1186                 if (chain_length > ip_rt_gc_elasticity) {
1187                         *candp = cand->dst.rt_next;
1188                         rt_free(cand);
1189                 }
1190         } else {
1191                 if (chain_length > rt_chain_length_max &&
1192                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1193                         struct net *net = dev_net(rt->dst.dev);
1194                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1195                         if (!rt_caching(net)) {
1196                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1197                                         rt->dst.dev->name, num);
1198                         }
1199                         rt_emergency_hash_rebuild(net);
1200                         spin_unlock_bh(rt_hash_lock_addr(hash));
1201
1202                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1203                                         ifindex, rt_genid(net));
1204                         goto restart;
1205                 }
1206         }
1207
1208         /* Try to bind route to arp only if it is output
1209            route or unicast forwarding path.
1210          */
1211         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1212                 int err = arp_bind_neighbour(&rt->dst);
1213                 if (err) {
1214                         spin_unlock_bh(rt_hash_lock_addr(hash));
1215
1216                         if (err != -ENOBUFS) {
1217                                 rt_drop(rt);
1218                                 return err;
1219                         }
1220
1221                         /* Neighbour tables are full and nothing
1222                            can be released. Try to shrink route cache,
1223                            it is most likely it holds some neighbour records.
1224                          */
1225                         if (attempts-- > 0) {
1226                                 int saved_elasticity = ip_rt_gc_elasticity;
1227                                 int saved_int = ip_rt_gc_min_interval;
1228                                 ip_rt_gc_elasticity     = 1;
1229                                 ip_rt_gc_min_interval   = 0;
1230                                 rt_garbage_collect(&ipv4_dst_ops);
1231                                 ip_rt_gc_min_interval   = saved_int;
1232                                 ip_rt_gc_elasticity     = saved_elasticity;
1233                                 goto restart;
1234                         }
1235
1236                         if (net_ratelimit())
1237                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1238                         rt_drop(rt);
1239                         return -ENOBUFS;
1240                 }
1241         }
1242
1243         rt->dst.rt_next = rt_hash_table[hash].chain;
1244
1245 #if RT_CACHE_DEBUG >= 2
1246         if (rt->dst.rt_next) {
1247                 struct rtable *trt;
1248                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1249                        hash, &rt->rt_dst);
1250                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1251                         printk(" . %pI4", &trt->rt_dst);
1252                 printk("\n");
1253         }
1254 #endif
1255         /*
1256          * Since lookup is lockfree, we must make sure
1257          * previous writes to rt are comitted to memory
1258          * before making rt visible to other CPUS.
1259          */
1260         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1261
1262         spin_unlock_bh(rt_hash_lock_addr(hash));
1263
1264 skip_hashing:
1265         if (rp)
1266                 *rp = rt;
1267         else
1268                 skb_dst_set(skb, &rt->dst);
1269         return 0;
1270 }
1271
1272 void rt_bind_peer(struct rtable *rt, int create)
1273 {
1274         struct inet_peer *peer;
1275
1276         peer = inet_getpeer(rt->rt_dst, create);
1277
1278         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1279                 inet_putpeer(peer);
1280 }
1281
1282 /*
1283  * Peer allocation may fail only in serious out-of-memory conditions.  However
1284  * we still can generate some output.
1285  * Random ID selection looks a bit dangerous because we have no chances to
1286  * select ID being unique in a reasonable period of time.
1287  * But broken packet identifier may be better than no packet at all.
1288  */
1289 static void ip_select_fb_ident(struct iphdr *iph)
1290 {
1291         static DEFINE_SPINLOCK(ip_fb_id_lock);
1292         static u32 ip_fallback_id;
1293         u32 salt;
1294
1295         spin_lock_bh(&ip_fb_id_lock);
1296         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1297         iph->id = htons(salt & 0xFFFF);
1298         ip_fallback_id = salt;
1299         spin_unlock_bh(&ip_fb_id_lock);
1300 }
1301
1302 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1303 {
1304         struct rtable *rt = (struct rtable *) dst;
1305
1306         if (rt) {
1307                 if (rt->peer == NULL)
1308                         rt_bind_peer(rt, 1);
1309
1310                 /* If peer is attached to destination, it is never detached,
1311                    so that we need not to grab a lock to dereference it.
1312                  */
1313                 if (rt->peer) {
1314                         iph->id = htons(inet_getid(rt->peer, more));
1315                         return;
1316                 }
1317         } else
1318                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1319                        __builtin_return_address(0));
1320
1321         ip_select_fb_ident(iph);
1322 }
1323 EXPORT_SYMBOL(__ip_select_ident);
1324
1325 static void rt_del(unsigned hash, struct rtable *rt)
1326 {
1327         struct rtable **rthp, *aux;
1328
1329         rthp = &rt_hash_table[hash].chain;
1330         spin_lock_bh(rt_hash_lock_addr(hash));
1331         ip_rt_put(rt);
1332         while ((aux = *rthp) != NULL) {
1333                 if (aux == rt || rt_is_expired(aux)) {
1334                         *rthp = aux->dst.rt_next;
1335                         rt_free(aux);
1336                         continue;
1337                 }
1338                 rthp = &aux->dst.rt_next;
1339         }
1340         spin_unlock_bh(rt_hash_lock_addr(hash));
1341 }
1342
1343 /* called in rcu_read_lock() section */
1344 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1345                     __be32 saddr, struct net_device *dev)
1346 {
1347         int i, k;
1348         struct in_device *in_dev = __in_dev_get_rcu(dev);
1349         struct rtable *rth, **rthp;
1350         __be32  skeys[2] = { saddr, 0 };
1351         int  ikeys[2] = { dev->ifindex, 0 };
1352         struct netevent_redirect netevent;
1353         struct net *net;
1354
1355         if (!in_dev)
1356                 return;
1357
1358         net = dev_net(dev);
1359         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1360             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1361             ipv4_is_zeronet(new_gw))
1362                 goto reject_redirect;
1363
1364         if (!rt_caching(net))
1365                 goto reject_redirect;
1366
1367         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1368                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1369                         goto reject_redirect;
1370                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1371                         goto reject_redirect;
1372         } else {
1373                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1374                         goto reject_redirect;
1375         }
1376
1377         for (i = 0; i < 2; i++) {
1378                 for (k = 0; k < 2; k++) {
1379                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1380                                                 rt_genid(net));
1381
1382                         rthp=&rt_hash_table[hash].chain;
1383
1384                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1385                                 struct rtable *rt;
1386
1387                                 if (rth->fl.fl4_dst != daddr ||
1388                                     rth->fl.fl4_src != skeys[i] ||
1389                                     rth->fl.oif != ikeys[k] ||
1390                                     rth->fl.iif != 0 ||
1391                                     rt_is_expired(rth) ||
1392                                     !net_eq(dev_net(rth->dst.dev), net)) {
1393                                         rthp = &rth->dst.rt_next;
1394                                         continue;
1395                                 }
1396
1397                                 if (rth->rt_dst != daddr ||
1398                                     rth->rt_src != saddr ||
1399                                     rth->dst.error ||
1400                                     rth->rt_gateway != old_gw ||
1401                                     rth->dst.dev != dev)
1402                                         break;
1403
1404                                 dst_hold(&rth->dst);
1405
1406                                 rt = dst_alloc(&ipv4_dst_ops);
1407                                 if (rt == NULL) {
1408                                         ip_rt_put(rth);
1409                                         return;
1410                                 }
1411
1412                                 /* Copy all the information. */
1413                                 *rt = *rth;
1414                                 rt->dst.__use           = 1;
1415                                 atomic_set(&rt->dst.__refcnt, 1);
1416                                 rt->dst.child           = NULL;
1417                                 if (rt->dst.dev)
1418                                         dev_hold(rt->dst.dev);
1419                                 if (rt->idev)
1420                                         in_dev_hold(rt->idev);
1421                                 rt->dst.obsolete        = -1;
1422                                 rt->dst.lastuse = jiffies;
1423                                 rt->dst.path            = &rt->dst;
1424                                 rt->dst.neighbour       = NULL;
1425                                 rt->dst.hh              = NULL;
1426 #ifdef CONFIG_XFRM
1427                                 rt->dst.xfrm            = NULL;
1428 #endif
1429                                 rt->rt_genid            = rt_genid(net);
1430                                 rt->rt_flags            |= RTCF_REDIRECTED;
1431
1432                                 /* Gateway is different ... */
1433                                 rt->rt_gateway          = new_gw;
1434
1435                                 /* Redirect received -> path was valid */
1436                                 dst_confirm(&rth->dst);
1437
1438                                 if (rt->peer)
1439                                         atomic_inc(&rt->peer->refcnt);
1440
1441                                 if (arp_bind_neighbour(&rt->dst) ||
1442                                     !(rt->dst.neighbour->nud_state &
1443                                             NUD_VALID)) {
1444                                         if (rt->dst.neighbour)
1445                                                 neigh_event_send(rt->dst.neighbour, NULL);
1446                                         ip_rt_put(rth);
1447                                         rt_drop(rt);
1448                                         goto do_next;
1449                                 }
1450
1451                                 netevent.old = &rth->dst;
1452                                 netevent.new = &rt->dst;
1453                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1454                                                         &netevent);
1455
1456                                 rt_del(hash, rth);
1457                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1458                                         ip_rt_put(rt);
1459                                 goto do_next;
1460                         }
1461                 do_next:
1462                         ;
1463                 }
1464         }
1465         return;
1466
1467 reject_redirect:
1468 #ifdef CONFIG_IP_ROUTE_VERBOSE
1469         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1470                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1471                         "  Advised path = %pI4 -> %pI4\n",
1472                        &old_gw, dev->name, &new_gw,
1473                        &saddr, &daddr);
1474 #endif
1475         ;
1476 }
1477
1478 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1479 {
1480         struct rtable *rt = (struct rtable *)dst;
1481         struct dst_entry *ret = dst;
1482
1483         if (rt) {
1484                 if (dst->obsolete > 0) {
1485                         ip_rt_put(rt);
1486                         ret = NULL;
1487                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1488                            (rt->dst.expires &&
1489                             time_after_eq(jiffies, rt->dst.expires))) {
1490                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1491                                                 rt->fl.oif,
1492                                                 rt_genid(dev_net(dst->dev)));
1493 #if RT_CACHE_DEBUG >= 1
1494                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1495                                 &rt->rt_dst, rt->fl.fl4_tos);
1496 #endif
1497                         rt_del(hash, rt);
1498                         ret = NULL;
1499                 }
1500         }
1501         return ret;
1502 }
1503
1504 /*
1505  * Algorithm:
1506  *      1. The first ip_rt_redirect_number redirects are sent
1507  *         with exponential backoff, then we stop sending them at all,
1508  *         assuming that the host ignores our redirects.
1509  *      2. If we did not see packets requiring redirects
1510  *         during ip_rt_redirect_silence, we assume that the host
1511  *         forgot redirected route and start to send redirects again.
1512  *
1513  * This algorithm is much cheaper and more intelligent than dumb load limiting
1514  * in icmp.c.
1515  *
1516  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1517  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1518  */
1519
1520 void ip_rt_send_redirect(struct sk_buff *skb)
1521 {
1522         struct rtable *rt = skb_rtable(skb);
1523         struct in_device *in_dev;
1524         int log_martians;
1525
1526         rcu_read_lock();
1527         in_dev = __in_dev_get_rcu(rt->dst.dev);
1528         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1529                 rcu_read_unlock();
1530                 return;
1531         }
1532         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1533         rcu_read_unlock();
1534
1535         /* No redirected packets during ip_rt_redirect_silence;
1536          * reset the algorithm.
1537          */
1538         if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1539                 rt->dst.rate_tokens = 0;
1540
1541         /* Too many ignored redirects; do not send anything
1542          * set dst.rate_last to the last seen redirected packet.
1543          */
1544         if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1545                 rt->dst.rate_last = jiffies;
1546                 return;
1547         }
1548
1549         /* Check for load limit; set rate_last to the latest sent
1550          * redirect.
1551          */
1552         if (rt->dst.rate_tokens == 0 ||
1553             time_after(jiffies,
1554                        (rt->dst.rate_last +
1555                         (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1556                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1557                 rt->dst.rate_last = jiffies;
1558                 ++rt->dst.rate_tokens;
1559 #ifdef CONFIG_IP_ROUTE_VERBOSE
1560                 if (log_martians &&
1561                     rt->dst.rate_tokens == ip_rt_redirect_number &&
1562                     net_ratelimit())
1563                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1564                                 &rt->rt_src, rt->rt_iif,
1565                                 &rt->rt_dst, &rt->rt_gateway);
1566 #endif
1567         }
1568 }
1569
1570 static int ip_error(struct sk_buff *skb)
1571 {
1572         struct rtable *rt = skb_rtable(skb);
1573         unsigned long now;
1574         int code;
1575
1576         switch (rt->dst.error) {
1577                 case EINVAL:
1578                 default:
1579                         goto out;
1580                 case EHOSTUNREACH:
1581                         code = ICMP_HOST_UNREACH;
1582                         break;
1583                 case ENETUNREACH:
1584                         code = ICMP_NET_UNREACH;
1585                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1586                                         IPSTATS_MIB_INNOROUTES);
1587                         break;
1588                 case EACCES:
1589                         code = ICMP_PKT_FILTERED;
1590                         break;
1591         }
1592
1593         now = jiffies;
1594         rt->dst.rate_tokens += now - rt->dst.rate_last;
1595         if (rt->dst.rate_tokens > ip_rt_error_burst)
1596                 rt->dst.rate_tokens = ip_rt_error_burst;
1597         rt->dst.rate_last = now;
1598         if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1599                 rt->dst.rate_tokens -= ip_rt_error_cost;
1600                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1601         }
1602
1603 out:    kfree_skb(skb);
1604         return 0;
1605 }
1606
1607 /*
1608  *      The last two values are not from the RFC but
1609  *      are needed for AMPRnet AX.25 paths.
1610  */
1611
1612 static const unsigned short mtu_plateau[] =
1613 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1614
1615 static inline unsigned short guess_mtu(unsigned short old_mtu)
1616 {
1617         int i;
1618
1619         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1620                 if (old_mtu > mtu_plateau[i])
1621                         return mtu_plateau[i];
1622         return 68;
1623 }
1624
1625 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1626                                  unsigned short new_mtu,
1627                                  struct net_device *dev)
1628 {
1629         int i, k;
1630         unsigned short old_mtu = ntohs(iph->tot_len);
1631         struct rtable *rth;
1632         int  ikeys[2] = { dev->ifindex, 0 };
1633         __be32  skeys[2] = { iph->saddr, 0, };
1634         __be32  daddr = iph->daddr;
1635         unsigned short est_mtu = 0;
1636
1637         for (k = 0; k < 2; k++) {
1638                 for (i = 0; i < 2; i++) {
1639                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1640                                                 rt_genid(net));
1641
1642                         rcu_read_lock();
1643                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1644                              rth = rcu_dereference(rth->dst.rt_next)) {
1645                                 unsigned short mtu = new_mtu;
1646
1647                                 if (rth->fl.fl4_dst != daddr ||
1648                                     rth->fl.fl4_src != skeys[i] ||
1649                                     rth->rt_dst != daddr ||
1650                                     rth->rt_src != iph->saddr ||
1651                                     rth->fl.oif != ikeys[k] ||
1652                                     rth->fl.iif != 0 ||
1653                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1654                                     !net_eq(dev_net(rth->dst.dev), net) ||
1655                                     rt_is_expired(rth))
1656                                         continue;
1657
1658                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1659
1660                                         /* BSD 4.2 compatibility hack :-( */
1661                                         if (mtu == 0 &&
1662                                             old_mtu >= dst_mtu(&rth->dst) &&
1663                                             old_mtu >= 68 + (iph->ihl << 2))
1664                                                 old_mtu -= iph->ihl << 2;
1665
1666                                         mtu = guess_mtu(old_mtu);
1667                                 }
1668                                 if (mtu <= dst_mtu(&rth->dst)) {
1669                                         if (mtu < dst_mtu(&rth->dst)) {
1670                                                 dst_confirm(&rth->dst);
1671                                                 if (mtu < ip_rt_min_pmtu) {
1672                                                         mtu = ip_rt_min_pmtu;
1673                                                         rth->dst.metrics[RTAX_LOCK-1] |=
1674                                                                 (1 << RTAX_MTU);
1675                                                 }
1676                                                 rth->dst.metrics[RTAX_MTU-1] = mtu;
1677                                                 dst_set_expires(&rth->dst,
1678                                                         ip_rt_mtu_expires);
1679                                         }
1680                                         est_mtu = mtu;
1681                                 }
1682                         }
1683                         rcu_read_unlock();
1684                 }
1685         }
1686         return est_mtu ? : new_mtu;
1687 }
1688
1689 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1690 {
1691         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1692             !(dst_metric_locked(dst, RTAX_MTU))) {
1693                 if (mtu < ip_rt_min_pmtu) {
1694                         mtu = ip_rt_min_pmtu;
1695                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1696                 }
1697                 dst->metrics[RTAX_MTU-1] = mtu;
1698                 dst_set_expires(dst, ip_rt_mtu_expires);
1699                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1700         }
1701 }
1702
1703 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1704 {
1705         if (rt_is_expired((struct rtable *)dst))
1706                 return NULL;
1707         return dst;
1708 }
1709
1710 static void ipv4_dst_destroy(struct dst_entry *dst)
1711 {
1712         struct rtable *rt = (struct rtable *) dst;
1713         struct inet_peer *peer = rt->peer;
1714         struct in_device *idev = rt->idev;
1715
1716         if (peer) {
1717                 rt->peer = NULL;
1718                 inet_putpeer(peer);
1719         }
1720
1721         if (idev) {
1722                 rt->idev = NULL;
1723                 in_dev_put(idev);
1724         }
1725 }
1726
1727 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1728                             int how)
1729 {
1730         struct rtable *rt = (struct rtable *) dst;
1731         struct in_device *idev = rt->idev;
1732         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1733                 struct in_device *loopback_idev =
1734                         in_dev_get(dev_net(dev)->loopback_dev);
1735                 if (loopback_idev) {
1736                         rt->idev = loopback_idev;
1737                         in_dev_put(idev);
1738                 }
1739         }
1740 }
1741
1742 static void ipv4_link_failure(struct sk_buff *skb)
1743 {
1744         struct rtable *rt;
1745
1746         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1747
1748         rt = skb_rtable(skb);
1749         if (rt)
1750                 dst_set_expires(&rt->dst, 0);
1751 }
1752
1753 static int ip_rt_bug(struct sk_buff *skb)
1754 {
1755         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1756                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1757                 skb->dev ? skb->dev->name : "?");
1758         kfree_skb(skb);
1759         return 0;
1760 }
1761
1762 /*
1763    We do not cache source address of outgoing interface,
1764    because it is used only by IP RR, TS and SRR options,
1765    so that it out of fast path.
1766
1767    BTW remember: "addr" is allowed to be not aligned
1768    in IP options!
1769  */
1770
1771 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1772 {
1773         __be32 src;
1774         struct fib_result res;
1775
1776         if (rt->fl.iif == 0)
1777                 src = rt->rt_src;
1778         else {
1779                 rcu_read_lock();
1780                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1781                         src = FIB_RES_PREFSRC(res);
1782                 else
1783                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1784                                         RT_SCOPE_UNIVERSE);
1785                 rcu_read_unlock();
1786         }
1787         memcpy(addr, &src, 4);
1788 }
1789
1790 #ifdef CONFIG_NET_CLS_ROUTE
1791 static void set_class_tag(struct rtable *rt, u32 tag)
1792 {
1793         if (!(rt->dst.tclassid & 0xFFFF))
1794                 rt->dst.tclassid |= tag & 0xFFFF;
1795         if (!(rt->dst.tclassid & 0xFFFF0000))
1796                 rt->dst.tclassid |= tag & 0xFFFF0000;
1797 }
1798 #endif
1799
1800 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1801 {
1802         struct fib_info *fi = res->fi;
1803
1804         if (fi) {
1805                 if (FIB_RES_GW(*res) &&
1806                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1807                         rt->rt_gateway = FIB_RES_GW(*res);
1808                 memcpy(rt->dst.metrics, fi->fib_metrics,
1809                        sizeof(rt->dst.metrics));
1810                 if (fi->fib_mtu == 0) {
1811                         rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1812                         if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1813                             rt->rt_gateway != rt->rt_dst &&
1814                             rt->dst.dev->mtu > 576)
1815                                 rt->dst.metrics[RTAX_MTU-1] = 576;
1816                 }
1817 #ifdef CONFIG_NET_CLS_ROUTE
1818                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1819 #endif
1820         } else
1821                 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
1822
1823         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1824                 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1825         if (dst_mtu(&rt->dst) > IP_MAX_MTU)
1826                 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1827         if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1828                 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1829                                        ip_rt_min_advmss);
1830         if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1831                 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1832
1833 #ifdef CONFIG_NET_CLS_ROUTE
1834 #ifdef CONFIG_IP_MULTIPLE_TABLES
1835         set_class_tag(rt, fib_rules_tclass(res));
1836 #endif
1837         set_class_tag(rt, itag);
1838 #endif
1839         rt->rt_type = res->type;
1840 }
1841
1842 /* called in rcu_read_lock() section */
1843 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1844                                 u8 tos, struct net_device *dev, int our)
1845 {
1846         unsigned int hash;
1847         struct rtable *rth;
1848         __be32 spec_dst;
1849         struct in_device *in_dev = __in_dev_get_rcu(dev);
1850         u32 itag = 0;
1851         int err;
1852
1853         /* Primary sanity checks. */
1854
1855         if (in_dev == NULL)
1856                 return -EINVAL;
1857
1858         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1859             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1860                 goto e_inval;
1861
1862         if (ipv4_is_zeronet(saddr)) {
1863                 if (!ipv4_is_local_multicast(daddr))
1864                         goto e_inval;
1865                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1866         } else {
1867                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1868                                           &itag, 0);
1869                 if (err < 0)
1870                         goto e_err;
1871         }
1872         rth = dst_alloc(&ipv4_dst_ops);
1873         if (!rth)
1874                 goto e_nobufs;
1875
1876         rth->dst.output = ip_rt_bug;
1877         rth->dst.obsolete = -1;
1878
1879         atomic_set(&rth->dst.__refcnt, 1);
1880         rth->dst.flags= DST_HOST;
1881         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1882                 rth->dst.flags |= DST_NOPOLICY;
1883         rth->fl.fl4_dst = daddr;
1884         rth->rt_dst     = daddr;
1885         rth->fl.fl4_tos = tos;
1886         rth->fl.mark    = skb->mark;
1887         rth->fl.fl4_src = saddr;
1888         rth->rt_src     = saddr;
1889 #ifdef CONFIG_NET_CLS_ROUTE
1890         rth->dst.tclassid = itag;
1891 #endif
1892         rth->rt_iif     =
1893         rth->fl.iif     = dev->ifindex;
1894         rth->dst.dev    = init_net.loopback_dev;
1895         dev_hold(rth->dst.dev);
1896         rth->idev       = in_dev_get(rth->dst.dev);
1897         rth->fl.oif     = 0;
1898         rth->rt_gateway = daddr;
1899         rth->rt_spec_dst= spec_dst;
1900         rth->rt_genid   = rt_genid(dev_net(dev));
1901         rth->rt_flags   = RTCF_MULTICAST;
1902         rth->rt_type    = RTN_MULTICAST;
1903         if (our) {
1904                 rth->dst.input= ip_local_deliver;
1905                 rth->rt_flags |= RTCF_LOCAL;
1906         }
1907
1908 #ifdef CONFIG_IP_MROUTE
1909         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1910                 rth->dst.input = ip_mr_input;
1911 #endif
1912         RT_CACHE_STAT_INC(in_slow_mc);
1913
1914         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1915         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1916
1917 e_nobufs:
1918         return -ENOBUFS;
1919 e_inval:
1920         return -EINVAL;
1921 e_err:
1922         return err;
1923 }
1924
1925
1926 static void ip_handle_martian_source(struct net_device *dev,
1927                                      struct in_device *in_dev,
1928                                      struct sk_buff *skb,
1929                                      __be32 daddr,
1930                                      __be32 saddr)
1931 {
1932         RT_CACHE_STAT_INC(in_martian_src);
1933 #ifdef CONFIG_IP_ROUTE_VERBOSE
1934         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1935                 /*
1936                  *      RFC1812 recommendation, if source is martian,
1937                  *      the only hint is MAC header.
1938                  */
1939                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1940                         &daddr, &saddr, dev->name);
1941                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1942                         int i;
1943                         const unsigned char *p = skb_mac_header(skb);
1944                         printk(KERN_WARNING "ll header: ");
1945                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1946                                 printk("%02x", *p);
1947                                 if (i < (dev->hard_header_len - 1))
1948                                         printk(":");
1949                         }
1950                         printk("\n");
1951                 }
1952         }
1953 #endif
1954 }
1955
1956 /* called in rcu_read_lock() section */
1957 static int __mkroute_input(struct sk_buff *skb,
1958                            struct fib_result *res,
1959                            struct in_device *in_dev,
1960                            __be32 daddr, __be32 saddr, u32 tos,
1961                            struct rtable **result)
1962 {
1963         struct rtable *rth;
1964         int err;
1965         struct in_device *out_dev;
1966         unsigned int flags = 0;
1967         __be32 spec_dst;
1968         u32 itag;
1969
1970         /* get a working reference to the output device */
1971         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1972         if (out_dev == NULL) {
1973                 if (net_ratelimit())
1974                         printk(KERN_CRIT "Bug in ip_route_input" \
1975                                "_slow(). Please, report\n");
1976                 return -EINVAL;
1977         }
1978
1979
1980         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1981                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1982         if (err < 0) {
1983                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1984                                          saddr);
1985
1986                 goto cleanup;
1987         }
1988
1989         if (err)
1990                 flags |= RTCF_DIRECTSRC;
1991
1992         if (out_dev == in_dev && err &&
1993             (IN_DEV_SHARED_MEDIA(out_dev) ||
1994              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1995                 flags |= RTCF_DOREDIRECT;
1996
1997         if (skb->protocol != htons(ETH_P_IP)) {
1998                 /* Not IP (i.e. ARP). Do not create route, if it is
1999                  * invalid for proxy arp. DNAT routes are always valid.
2000                  *
2001                  * Proxy arp feature have been extended to allow, ARP
2002                  * replies back to the same interface, to support
2003                  * Private VLAN switch technologies. See arp.c.
2004                  */
2005                 if (out_dev == in_dev &&
2006                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2007                         err = -EINVAL;
2008                         goto cleanup;
2009                 }
2010         }
2011
2012
2013         rth = dst_alloc(&ipv4_dst_ops);
2014         if (!rth) {
2015                 err = -ENOBUFS;
2016                 goto cleanup;
2017         }
2018
2019         atomic_set(&rth->dst.__refcnt, 1);
2020         rth->dst.flags= DST_HOST;
2021         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2022                 rth->dst.flags |= DST_NOPOLICY;
2023         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2024                 rth->dst.flags |= DST_NOXFRM;
2025         rth->fl.fl4_dst = daddr;
2026         rth->rt_dst     = daddr;
2027         rth->fl.fl4_tos = tos;
2028         rth->fl.mark    = skb->mark;
2029         rth->fl.fl4_src = saddr;
2030         rth->rt_src     = saddr;
2031         rth->rt_gateway = daddr;
2032         rth->rt_iif     =
2033                 rth->fl.iif     = in_dev->dev->ifindex;
2034         rth->dst.dev    = (out_dev)->dev;
2035         dev_hold(rth->dst.dev);
2036         rth->idev       = in_dev_get(rth->dst.dev);
2037         rth->fl.oif     = 0;
2038         rth->rt_spec_dst= spec_dst;
2039
2040         rth->dst.obsolete = -1;
2041         rth->dst.input = ip_forward;
2042         rth->dst.output = ip_output;
2043         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2044
2045         rt_set_nexthop(rth, res, itag);
2046
2047         rth->rt_flags = flags;
2048
2049         *result = rth;
2050         err = 0;
2051  cleanup:
2052         return err;
2053 }
2054
2055 static int ip_mkroute_input(struct sk_buff *skb,
2056                             struct fib_result *res,
2057                             const struct flowi *fl,
2058                             struct in_device *in_dev,
2059                             __be32 daddr, __be32 saddr, u32 tos)
2060 {
2061         struct rtable* rth = NULL;
2062         int err;
2063         unsigned hash;
2064
2065 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2066         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2067                 fib_select_multipath(fl, res);
2068 #endif
2069
2070         /* create a routing cache entry */
2071         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2072         if (err)
2073                 return err;
2074
2075         /* put it into the cache */
2076         hash = rt_hash(daddr, saddr, fl->iif,
2077                        rt_genid(dev_net(rth->dst.dev)));
2078         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2079 }
2080
2081 /*
2082  *      NOTE. We drop all the packets that has local source
2083  *      addresses, because every properly looped back packet
2084  *      must have correct destination already attached by output routine.
2085  *
2086  *      Such approach solves two big problems:
2087  *      1. Not simplex devices are handled properly.
2088  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2089  *      called with rcu_read_lock()
2090  */
2091
2092 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2093                                u8 tos, struct net_device *dev)
2094 {
2095         struct fib_result res;
2096         struct in_device *in_dev = __in_dev_get_rcu(dev);
2097         struct flowi fl = { .nl_u = { .ip4_u =
2098                                       { .daddr = daddr,
2099                                         .saddr = saddr,
2100                                         .tos = tos,
2101                                         .scope = RT_SCOPE_UNIVERSE,
2102                                       } },
2103                             .mark = skb->mark,
2104                             .iif = dev->ifindex };
2105         unsigned        flags = 0;
2106         u32             itag = 0;
2107         struct rtable * rth;
2108         unsigned        hash;
2109         __be32          spec_dst;
2110         int             err = -EINVAL;
2111         struct net    * net = dev_net(dev);
2112
2113         /* IP on this device is disabled. */
2114
2115         if (!in_dev)
2116                 goto out;
2117
2118         /* Check for the most weird martians, which can be not detected
2119            by fib_lookup.
2120          */
2121
2122         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2123             ipv4_is_loopback(saddr))
2124                 goto martian_source;
2125
2126         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2127                 goto brd_input;
2128
2129         /* Accept zero addresses only to limited broadcast;
2130          * I even do not know to fix it or not. Waiting for complains :-)
2131          */
2132         if (ipv4_is_zeronet(saddr))
2133                 goto martian_source;
2134
2135         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2136                 goto martian_destination;
2137
2138         /*
2139          *      Now we are ready to route packet.
2140          */
2141         err = fib_lookup(net, &fl, &res);
2142         if (err != 0) {
2143                 if (!IN_DEV_FORWARD(in_dev))
2144                         goto e_hostunreach;
2145                 goto no_route;
2146         }
2147
2148         RT_CACHE_STAT_INC(in_slow_tot);
2149
2150         if (res.type == RTN_BROADCAST)
2151                 goto brd_input;
2152
2153         if (res.type == RTN_LOCAL) {
2154                 err = fib_validate_source(saddr, daddr, tos,
2155                                           net->loopback_dev->ifindex,
2156                                           dev, &spec_dst, &itag, skb->mark);
2157                 if (err < 0)
2158                         goto martian_source_keep_err;
2159                 if (err)
2160                         flags |= RTCF_DIRECTSRC;
2161                 spec_dst = daddr;
2162                 goto local_input;
2163         }
2164
2165         if (!IN_DEV_FORWARD(in_dev))
2166                 goto e_hostunreach;
2167         if (res.type != RTN_UNICAST)
2168                 goto martian_destination;
2169
2170         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2171 out:    return err;
2172
2173 brd_input:
2174         if (skb->protocol != htons(ETH_P_IP))
2175                 goto e_inval;
2176
2177         if (ipv4_is_zeronet(saddr))
2178                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2179         else {
2180                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2181                                           &itag, skb->mark);
2182                 if (err < 0)
2183                         goto martian_source_keep_err;
2184                 if (err)
2185                         flags |= RTCF_DIRECTSRC;
2186         }
2187         flags |= RTCF_BROADCAST;
2188         res.type = RTN_BROADCAST;
2189         RT_CACHE_STAT_INC(in_brd);
2190
2191 local_input:
2192         rth = dst_alloc(&ipv4_dst_ops);
2193         if (!rth)
2194                 goto e_nobufs;
2195
2196         rth->dst.output= ip_rt_bug;
2197         rth->dst.obsolete = -1;
2198         rth->rt_genid = rt_genid(net);
2199
2200         atomic_set(&rth->dst.__refcnt, 1);
2201         rth->dst.flags= DST_HOST;
2202         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2203                 rth->dst.flags |= DST_NOPOLICY;
2204         rth->fl.fl4_dst = daddr;
2205         rth->rt_dst     = daddr;
2206         rth->fl.fl4_tos = tos;
2207         rth->fl.mark    = skb->mark;
2208         rth->fl.fl4_src = saddr;
2209         rth->rt_src     = saddr;
2210 #ifdef CONFIG_NET_CLS_ROUTE
2211         rth->dst.tclassid = itag;
2212 #endif
2213         rth->rt_iif     =
2214         rth->fl.iif     = dev->ifindex;
2215         rth->dst.dev    = net->loopback_dev;
2216         dev_hold(rth->dst.dev);
2217         rth->idev       = in_dev_get(rth->dst.dev);
2218         rth->rt_gateway = daddr;
2219         rth->rt_spec_dst= spec_dst;
2220         rth->dst.input= ip_local_deliver;
2221         rth->rt_flags   = flags|RTCF_LOCAL;
2222         if (res.type == RTN_UNREACHABLE) {
2223                 rth->dst.input= ip_error;
2224                 rth->dst.error= -err;
2225                 rth->rt_flags   &= ~RTCF_LOCAL;
2226         }
2227         rth->rt_type    = res.type;
2228         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2229         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2230         goto out;
2231
2232 no_route:
2233         RT_CACHE_STAT_INC(in_no_route);
2234         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2235         res.type = RTN_UNREACHABLE;
2236         if (err == -ESRCH)
2237                 err = -ENETUNREACH;
2238         goto local_input;
2239
2240         /*
2241          *      Do not cache martian addresses: they should be logged (RFC1812)
2242          */
2243 martian_destination:
2244         RT_CACHE_STAT_INC(in_martian_dst);
2245 #ifdef CONFIG_IP_ROUTE_VERBOSE
2246         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2247                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2248                         &daddr, &saddr, dev->name);
2249 #endif
2250
2251 e_hostunreach:
2252         err = -EHOSTUNREACH;
2253         goto out;
2254
2255 e_inval:
2256         err = -EINVAL;
2257         goto out;
2258
2259 e_nobufs:
2260         err = -ENOBUFS;
2261         goto out;
2262
2263 martian_source:
2264         err = -EINVAL;
2265 martian_source_keep_err:
2266         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2267         goto out;
2268 }
2269
2270 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2271                            u8 tos, struct net_device *dev, bool noref)
2272 {
2273         struct rtable * rth;
2274         unsigned        hash;
2275         int iif = dev->ifindex;
2276         struct net *net;
2277         int res;
2278
2279         net = dev_net(dev);
2280
2281         rcu_read_lock();
2282
2283         if (!rt_caching(net))
2284                 goto skip_cache;
2285
2286         tos &= IPTOS_RT_MASK;
2287         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2288
2289         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2290              rth = rcu_dereference(rth->dst.rt_next)) {
2291                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2292                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2293                      (rth->fl.iif ^ iif) |
2294                      rth->fl.oif |
2295                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2296                     rth->fl.mark == skb->mark &&
2297                     net_eq(dev_net(rth->dst.dev), net) &&
2298                     !rt_is_expired(rth)) {
2299                         if (noref) {
2300                                 dst_use_noref(&rth->dst, jiffies);
2301                                 skb_dst_set_noref(skb, &rth->dst);
2302                         } else {
2303                                 dst_use(&rth->dst, jiffies);
2304                                 skb_dst_set(skb, &rth->dst);
2305                         }
2306                         RT_CACHE_STAT_INC(in_hit);
2307                         rcu_read_unlock();
2308                         return 0;
2309                 }
2310                 RT_CACHE_STAT_INC(in_hlist_search);
2311         }
2312
2313 skip_cache:
2314         /* Multicast recognition logic is moved from route cache to here.
2315            The problem was that too many Ethernet cards have broken/missing
2316            hardware multicast filters :-( As result the host on multicasting
2317            network acquires a lot of useless route cache entries, sort of
2318            SDR messages from all the world. Now we try to get rid of them.
2319            Really, provided software IP multicast filter is organized
2320            reasonably (at least, hashed), it does not result in a slowdown
2321            comparing with route cache reject entries.
2322            Note, that multicast routers are not affected, because
2323            route cache entry is created eventually.
2324          */
2325         if (ipv4_is_multicast(daddr)) {
2326                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2327
2328                 if (in_dev) {
2329                         int our = ip_check_mc(in_dev, daddr, saddr,
2330                                               ip_hdr(skb)->protocol);
2331                         if (our
2332 #ifdef CONFIG_IP_MROUTE
2333                                 ||
2334                             (!ipv4_is_local_multicast(daddr) &&
2335                              IN_DEV_MFORWARD(in_dev))
2336 #endif
2337                            ) {
2338                                 int res = ip_route_input_mc(skb, daddr, saddr,
2339                                                             tos, dev, our);
2340                                 rcu_read_unlock();
2341                                 return res;
2342                         }
2343                 }
2344                 rcu_read_unlock();
2345                 return -EINVAL;
2346         }
2347         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2348         rcu_read_unlock();
2349         return res;
2350 }
2351 EXPORT_SYMBOL(ip_route_input_common);
2352
2353 /* called with rcu_read_lock() */
2354 static int __mkroute_output(struct rtable **result,
2355                             struct fib_result *res,
2356                             const struct flowi *fl,
2357                             const struct flowi *oldflp,
2358                             struct net_device *dev_out,
2359                             unsigned flags)
2360 {
2361         struct rtable *rth;
2362         struct in_device *in_dev;
2363         u32 tos = RT_FL_TOS(oldflp);
2364
2365         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2366                 return -EINVAL;
2367
2368         if (ipv4_is_lbcast(fl->fl4_dst))
2369                 res->type = RTN_BROADCAST;
2370         else if (ipv4_is_multicast(fl->fl4_dst))
2371                 res->type = RTN_MULTICAST;
2372         else if (ipv4_is_zeronet(fl->fl4_dst))
2373                 return -EINVAL;
2374
2375         if (dev_out->flags & IFF_LOOPBACK)
2376                 flags |= RTCF_LOCAL;
2377
2378         in_dev = __in_dev_get_rcu(dev_out);
2379         if (!in_dev)
2380                 return -EINVAL;
2381
2382         if (res->type == RTN_BROADCAST) {
2383                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2384                 res->fi = NULL;
2385         } else if (res->type == RTN_MULTICAST) {
2386                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2387                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2388                                  oldflp->proto))
2389                         flags &= ~RTCF_LOCAL;
2390                 /* If multicast route do not exist use
2391                  * default one, but do not gateway in this case.
2392                  * Yes, it is hack.
2393                  */
2394                 if (res->fi && res->prefixlen < 4)
2395                         res->fi = NULL;
2396         }
2397
2398
2399         rth = dst_alloc(&ipv4_dst_ops);
2400         if (!rth)
2401                 return -ENOBUFS;
2402
2403         in_dev_hold(in_dev);
2404         rth->idev = in_dev;
2405
2406         atomic_set(&rth->dst.__refcnt, 1);
2407         rth->dst.flags= DST_HOST;
2408         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2409                 rth->dst.flags |= DST_NOXFRM;
2410         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2411                 rth->dst.flags |= DST_NOPOLICY;
2412
2413         rth->fl.fl4_dst = oldflp->fl4_dst;
2414         rth->fl.fl4_tos = tos;
2415         rth->fl.fl4_src = oldflp->fl4_src;
2416         rth->fl.oif     = oldflp->oif;
2417         rth->fl.mark    = oldflp->mark;
2418         rth->rt_dst     = fl->fl4_dst;
2419         rth->rt_src     = fl->fl4_src;
2420         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2421         /* get references to the devices that are to be hold by the routing
2422            cache entry */
2423         rth->dst.dev    = dev_out;
2424         dev_hold(dev_out);
2425         rth->rt_gateway = fl->fl4_dst;
2426         rth->rt_spec_dst= fl->fl4_src;
2427
2428         rth->dst.output=ip_output;
2429         rth->dst.obsolete = -1;
2430         rth->rt_genid = rt_genid(dev_net(dev_out));
2431
2432         RT_CACHE_STAT_INC(out_slow_tot);
2433
2434         if (flags & RTCF_LOCAL) {
2435                 rth->dst.input = ip_local_deliver;
2436                 rth->rt_spec_dst = fl->fl4_dst;
2437         }
2438         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2439                 rth->rt_spec_dst = fl->fl4_src;
2440                 if (flags & RTCF_LOCAL &&
2441                     !(dev_out->flags & IFF_LOOPBACK)) {
2442                         rth->dst.output = ip_mc_output;
2443                         RT_CACHE_STAT_INC(out_slow_mc);
2444                 }
2445 #ifdef CONFIG_IP_MROUTE
2446                 if (res->type == RTN_MULTICAST) {
2447                         if (IN_DEV_MFORWARD(in_dev) &&
2448                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2449                                 rth->dst.input = ip_mr_input;
2450                                 rth->dst.output = ip_mc_output;
2451                         }
2452                 }
2453 #endif
2454         }
2455
2456         rt_set_nexthop(rth, res, 0);
2457
2458         rth->rt_flags = flags;
2459         *result = rth;
2460         return 0;
2461 }
2462
2463 /* called with rcu_read_lock() */
2464 static int ip_mkroute_output(struct rtable **rp,
2465                              struct fib_result *res,
2466                              const struct flowi *fl,
2467                              const struct flowi *oldflp,
2468                              struct net_device *dev_out,
2469                              unsigned flags)
2470 {
2471         struct rtable *rth = NULL;
2472         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2473         unsigned hash;
2474         if (err == 0) {
2475                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2476                                rt_genid(dev_net(dev_out)));
2477                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2478         }
2479
2480         return err;
2481 }
2482
2483 /*
2484  * Major route resolver routine.
2485  * called with rcu_read_lock();
2486  */
2487
2488 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2489                                 const struct flowi *oldflp)
2490 {
2491         u32 tos = RT_FL_TOS(oldflp);
2492         struct flowi fl = { .nl_u = { .ip4_u =
2493                                       { .daddr = oldflp->fl4_dst,
2494                                         .saddr = oldflp->fl4_src,
2495                                         .tos = tos & IPTOS_RT_MASK,
2496                                         .scope = ((tos & RTO_ONLINK) ?
2497                                                   RT_SCOPE_LINK :
2498                                                   RT_SCOPE_UNIVERSE),
2499                                       } },
2500                             .mark = oldflp->mark,
2501                             .iif = net->loopback_dev->ifindex,
2502                             .oif = oldflp->oif };
2503         struct fib_result res;
2504         unsigned int flags = 0;
2505         struct net_device *dev_out = NULL;
2506         int err;
2507
2508
2509         res.fi          = NULL;
2510 #ifdef CONFIG_IP_MULTIPLE_TABLES
2511         res.r           = NULL;
2512 #endif
2513
2514         if (oldflp->fl4_src) {
2515                 err = -EINVAL;
2516                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2517                     ipv4_is_lbcast(oldflp->fl4_src) ||
2518                     ipv4_is_zeronet(oldflp->fl4_src))
2519                         goto out;
2520
2521                 /* I removed check for oif == dev_out->oif here.
2522                    It was wrong for two reasons:
2523                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2524                       is assigned to multiple interfaces.
2525                    2. Moreover, we are allowed to send packets with saddr
2526                       of another iface. --ANK
2527                  */
2528
2529                 if (oldflp->oif == 0 &&
2530                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2531                      ipv4_is_lbcast(oldflp->fl4_dst))) {
2532                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2533                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2534                         if (dev_out == NULL)
2535                                 goto out;
2536
2537                         /* Special hack: user can direct multicasts
2538                            and limited broadcast via necessary interface
2539                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2540                            This hack is not just for fun, it allows
2541                            vic,vat and friends to work.
2542                            They bind socket to loopback, set ttl to zero
2543                            and expect that it will work.
2544                            From the viewpoint of routing cache they are broken,
2545                            because we are not allowed to build multicast path
2546                            with loopback source addr (look, routing cache
2547                            cannot know, that ttl is zero, so that packet
2548                            will not leave this host and route is valid).
2549                            Luckily, this hack is good workaround.
2550                          */
2551
2552                         fl.oif = dev_out->ifindex;
2553                         goto make_route;
2554                 }
2555
2556                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2557                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2558                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2559                                 goto out;
2560                 }
2561         }
2562
2563
2564         if (oldflp->oif) {
2565                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2566                 err = -ENODEV;
2567                 if (dev_out == NULL)
2568                         goto out;
2569
2570                 /* RACE: Check return value of inet_select_addr instead. */
2571                 if (rcu_dereference(dev_out->ip_ptr) == NULL)
2572                         goto out;       /* Wrong error code */
2573
2574                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2575                     ipv4_is_lbcast(oldflp->fl4_dst)) {
2576                         if (!fl.fl4_src)
2577                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2578                                                               RT_SCOPE_LINK);
2579                         goto make_route;
2580                 }
2581                 if (!fl.fl4_src) {
2582                         if (ipv4_is_multicast(oldflp->fl4_dst))
2583                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2584                                                               fl.fl4_scope);
2585                         else if (!oldflp->fl4_dst)
2586                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2587                                                               RT_SCOPE_HOST);
2588                 }
2589         }
2590
2591         if (!fl.fl4_dst) {
2592                 fl.fl4_dst = fl.fl4_src;
2593                 if (!fl.fl4_dst)
2594                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2595                 dev_out = net->loopback_dev;
2596                 fl.oif = net->loopback_dev->ifindex;
2597                 res.type = RTN_LOCAL;
2598                 flags |= RTCF_LOCAL;
2599                 goto make_route;
2600         }
2601
2602         if (fib_lookup(net, &fl, &res)) {
2603                 res.fi = NULL;
2604                 if (oldflp->oif) {
2605                         /* Apparently, routing tables are wrong. Assume,
2606                            that the destination is on link.
2607
2608                            WHY? DW.
2609                            Because we are allowed to send to iface
2610                            even if it has NO routes and NO assigned
2611                            addresses. When oif is specified, routing
2612                            tables are looked up with only one purpose:
2613                            to catch if destination is gatewayed, rather than
2614                            direct. Moreover, if MSG_DONTROUTE is set,
2615                            we send packet, ignoring both routing tables
2616                            and ifaddr state. --ANK
2617
2618
2619                            We could make it even if oif is unknown,
2620                            likely IPv6, but we do not.
2621                          */
2622
2623                         if (fl.fl4_src == 0)
2624                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2625                                                               RT_SCOPE_LINK);
2626                         res.type = RTN_UNICAST;
2627                         goto make_route;
2628                 }
2629                 err = -ENETUNREACH;
2630                 goto out;
2631         }
2632
2633         if (res.type == RTN_LOCAL) {
2634                 if (!fl.fl4_src)
2635                         fl.fl4_src = fl.fl4_dst;
2636                 dev_out = net->loopback_dev;
2637                 fl.oif = dev_out->ifindex;
2638                 res.fi = NULL;
2639                 flags |= RTCF_LOCAL;
2640                 goto make_route;
2641         }
2642
2643 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2644         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2645                 fib_select_multipath(&fl, &res);
2646         else
2647 #endif
2648         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2649                 fib_select_default(net, &fl, &res);
2650
2651         if (!fl.fl4_src)
2652                 fl.fl4_src = FIB_RES_PREFSRC(res);
2653
2654         dev_out = FIB_RES_DEV(res);
2655         fl.oif = dev_out->ifindex;
2656
2657
2658 make_route:
2659         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2660
2661 out:    return err;
2662 }
2663
2664 int __ip_route_output_key(struct net *net, struct rtable **rp,
2665                           const struct flowi *flp)
2666 {
2667         unsigned int hash;
2668         int res;
2669         struct rtable *rth;
2670
2671         if (!rt_caching(net))
2672                 goto slow_output;
2673
2674         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2675
2676         rcu_read_lock_bh();
2677         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2678                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2679                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2680                     rth->fl.fl4_src == flp->fl4_src &&
2681                     rth->fl.iif == 0 &&
2682                     rth->fl.oif == flp->oif &&
2683                     rth->fl.mark == flp->mark &&
2684                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2685                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2686                     net_eq(dev_net(rth->dst.dev), net) &&
2687                     !rt_is_expired(rth)) {
2688                         dst_use(&rth->dst, jiffies);
2689                         RT_CACHE_STAT_INC(out_hit);
2690                         rcu_read_unlock_bh();
2691                         *rp = rth;
2692                         return 0;
2693                 }
2694                 RT_CACHE_STAT_INC(out_hlist_search);
2695         }
2696         rcu_read_unlock_bh();
2697
2698 slow_output:
2699         rcu_read_lock();
2700         res = ip_route_output_slow(net, rp, flp);
2701         rcu_read_unlock();
2702         return res;
2703 }
2704 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2705
2706 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2707 {
2708         return NULL;
2709 }
2710
2711 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2712 {
2713 }
2714
2715 static struct dst_ops ipv4_dst_blackhole_ops = {
2716         .family                 =       AF_INET,
2717         .protocol               =       cpu_to_be16(ETH_P_IP),
2718         .destroy                =       ipv4_dst_destroy,
2719         .check                  =       ipv4_blackhole_dst_check,
2720         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2721 };
2722
2723
2724 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2725 {
2726         struct rtable *ort = *rp;
2727         struct rtable *rt = (struct rtable *)
2728                 dst_alloc(&ipv4_dst_blackhole_ops);
2729
2730         if (rt) {
2731                 struct dst_entry *new = &rt->dst;
2732
2733                 atomic_set(&new->__refcnt, 1);
2734                 new->__use = 1;
2735                 new->input = dst_discard;
2736                 new->output = dst_discard;
2737                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
2738
2739                 new->dev = ort->dst.dev;
2740                 if (new->dev)
2741                         dev_hold(new->dev);
2742
2743                 rt->fl = ort->fl;
2744
2745                 rt->idev = ort->idev;
2746                 if (rt->idev)
2747                         in_dev_hold(rt->idev);
2748                 rt->rt_genid = rt_genid(net);
2749                 rt->rt_flags = ort->rt_flags;
2750                 rt->rt_type = ort->rt_type;
2751                 rt->rt_dst = ort->rt_dst;
2752                 rt->rt_src = ort->rt_src;
2753                 rt->rt_iif = ort->rt_iif;
2754                 rt->rt_gateway = ort->rt_gateway;
2755                 rt->rt_spec_dst = ort->rt_spec_dst;
2756                 rt->peer = ort->peer;
2757                 if (rt->peer)
2758                         atomic_inc(&rt->peer->refcnt);
2759
2760                 dst_free(new);
2761         }
2762
2763         dst_release(&(*rp)->dst);
2764         *rp = rt;
2765         return rt ? 0 : -ENOMEM;
2766 }
2767
2768 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2769                          struct sock *sk, int flags)
2770 {
2771         int err;
2772
2773         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2774                 return err;
2775
2776         if (flp->proto) {
2777                 if (!flp->fl4_src)
2778                         flp->fl4_src = (*rp)->rt_src;
2779                 if (!flp->fl4_dst)
2780                         flp->fl4_dst = (*rp)->rt_dst;
2781                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2782                                     flags ? XFRM_LOOKUP_WAIT : 0);
2783                 if (err == -EREMOTE)
2784                         err = ipv4_dst_blackhole(net, rp, flp);
2785
2786                 return err;
2787         }
2788
2789         return 0;
2790 }
2791 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2792
2793 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2794 {
2795         return ip_route_output_flow(net, rp, flp, NULL, 0);
2796 }
2797 EXPORT_SYMBOL(ip_route_output_key);
2798
2799 static int rt_fill_info(struct net *net,
2800                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2801                         int nowait, unsigned int flags)
2802 {
2803         struct rtable *rt = skb_rtable(skb);
2804         struct rtmsg *r;
2805         struct nlmsghdr *nlh;
2806         long expires;
2807         u32 id = 0, ts = 0, tsage = 0, error;
2808
2809         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2810         if (nlh == NULL)
2811                 return -EMSGSIZE;
2812
2813         r = nlmsg_data(nlh);
2814         r->rtm_family    = AF_INET;
2815         r->rtm_dst_len  = 32;
2816         r->rtm_src_len  = 0;
2817         r->rtm_tos      = rt->fl.fl4_tos;
2818         r->rtm_table    = RT_TABLE_MAIN;
2819         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2820         r->rtm_type     = rt->rt_type;
2821         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2822         r->rtm_protocol = RTPROT_UNSPEC;
2823         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2824         if (rt->rt_flags & RTCF_NOTIFY)
2825                 r->rtm_flags |= RTM_F_NOTIFY;
2826
2827         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2828
2829         if (rt->fl.fl4_src) {
2830                 r->rtm_src_len = 32;
2831                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2832         }
2833         if (rt->dst.dev)
2834                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2835 #ifdef CONFIG_NET_CLS_ROUTE
2836         if (rt->dst.tclassid)
2837                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2838 #endif
2839         if (rt->fl.iif)
2840                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2841         else if (rt->rt_src != rt->fl.fl4_src)
2842                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2843
2844         if (rt->rt_dst != rt->rt_gateway)
2845                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2846
2847         if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2848                 goto nla_put_failure;
2849
2850         if (rt->fl.mark)
2851                 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2852
2853         error = rt->dst.error;
2854         expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2855         if (rt->peer) {
2856                 inet_peer_refcheck(rt->peer);
2857                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2858                 if (rt->peer->tcp_ts_stamp) {
2859                         ts = rt->peer->tcp_ts;
2860                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2861                 }
2862         }
2863
2864         if (rt->fl.iif) {
2865 #ifdef CONFIG_IP_MROUTE
2866                 __be32 dst = rt->rt_dst;
2867
2868                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2869                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2870                         int err = ipmr_get_route(net, skb, r, nowait);
2871                         if (err <= 0) {
2872                                 if (!nowait) {
2873                                         if (err == 0)
2874                                                 return 0;
2875                                         goto nla_put_failure;
2876                                 } else {
2877                                         if (err == -EMSGSIZE)
2878                                                 goto nla_put_failure;
2879                                         error = err;
2880                                 }
2881                         }
2882                 } else
2883 #endif
2884                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2885         }
2886
2887         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2888                                expires, error) < 0)
2889                 goto nla_put_failure;
2890
2891         return nlmsg_end(skb, nlh);
2892
2893 nla_put_failure:
2894         nlmsg_cancel(skb, nlh);
2895         return -EMSGSIZE;
2896 }
2897
2898 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2899 {
2900         struct net *net = sock_net(in_skb->sk);
2901         struct rtmsg *rtm;
2902         struct nlattr *tb[RTA_MAX+1];
2903         struct rtable *rt = NULL;
2904         __be32 dst = 0;
2905         __be32 src = 0;
2906         u32 iif;
2907         int err;
2908         int mark;
2909         struct sk_buff *skb;
2910
2911         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2912         if (err < 0)
2913                 goto errout;
2914
2915         rtm = nlmsg_data(nlh);
2916
2917         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2918         if (skb == NULL) {
2919                 err = -ENOBUFS;
2920                 goto errout;
2921         }
2922
2923         /* Reserve room for dummy headers, this skb can pass
2924            through good chunk of routing engine.
2925          */
2926         skb_reset_mac_header(skb);
2927         skb_reset_network_header(skb);
2928
2929         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2930         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2931         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2932
2933         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2934         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2935         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2936         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2937
2938         if (iif) {
2939                 struct net_device *dev;
2940
2941                 dev = __dev_get_by_index(net, iif);
2942                 if (dev == NULL) {
2943                         err = -ENODEV;
2944                         goto errout_free;
2945                 }
2946
2947                 skb->protocol   = htons(ETH_P_IP);
2948                 skb->dev        = dev;
2949                 skb->mark       = mark;
2950                 local_bh_disable();
2951                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2952                 local_bh_enable();
2953
2954                 rt = skb_rtable(skb);
2955                 if (err == 0 && rt->dst.error)
2956                         err = -rt->dst.error;
2957         } else {
2958                 struct flowi fl = {
2959                         .nl_u = {
2960                                 .ip4_u = {
2961                                         .daddr = dst,
2962                                         .saddr = src,
2963                                         .tos = rtm->rtm_tos,
2964                                 },
2965                         },
2966                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2967                         .mark = mark,
2968                 };
2969                 err = ip_route_output_key(net, &rt, &fl);
2970         }
2971
2972         if (err)
2973                 goto errout_free;
2974
2975         skb_dst_set(skb, &rt->dst);
2976         if (rtm->rtm_flags & RTM_F_NOTIFY)
2977                 rt->rt_flags |= RTCF_NOTIFY;
2978
2979         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2980                            RTM_NEWROUTE, 0, 0);
2981         if (err <= 0)
2982                 goto errout_free;
2983
2984         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2985 errout:
2986         return err;
2987
2988 errout_free:
2989         kfree_skb(skb);
2990         goto errout;
2991 }
2992
2993 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2994 {
2995         struct rtable *rt;
2996         int h, s_h;
2997         int idx, s_idx;
2998         struct net *net;
2999
3000         net = sock_net(skb->sk);
3001
3002         s_h = cb->args[0];
3003         if (s_h < 0)
3004                 s_h = 0;
3005         s_idx = idx = cb->args[1];
3006         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3007                 if (!rt_hash_table[h].chain)
3008                         continue;
3009                 rcu_read_lock_bh();
3010                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3011                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3012                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3013                                 continue;
3014                         if (rt_is_expired(rt))
3015                                 continue;
3016                         skb_dst_set_noref(skb, &rt->dst);
3017                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3018                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3019                                          1, NLM_F_MULTI) <= 0) {
3020                                 skb_dst_drop(skb);
3021                                 rcu_read_unlock_bh();
3022                                 goto done;
3023                         }
3024                         skb_dst_drop(skb);
3025                 }
3026                 rcu_read_unlock_bh();
3027         }
3028
3029 done:
3030         cb->args[0] = h;
3031         cb->args[1] = idx;
3032         return skb->len;
3033 }
3034
3035 void ip_rt_multicast_event(struct in_device *in_dev)
3036 {
3037         rt_cache_flush(dev_net(in_dev->dev), 0);
3038 }
3039
3040 #ifdef CONFIG_SYSCTL
3041 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3042                                         void __user *buffer,
3043                                         size_t *lenp, loff_t *ppos)
3044 {
3045         if (write) {
3046                 int flush_delay;
3047                 ctl_table ctl;
3048                 struct net *net;
3049
3050                 memcpy(&ctl, __ctl, sizeof(ctl));
3051                 ctl.data = &flush_delay;
3052                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3053
3054                 net = (struct net *)__ctl->extra1;
3055                 rt_cache_flush(net, flush_delay);
3056                 return 0;
3057         }
3058
3059         return -EINVAL;
3060 }
3061
3062 static ctl_table ipv4_route_table[] = {
3063         {
3064                 .procname       = "gc_thresh",
3065                 .data           = &ipv4_dst_ops.gc_thresh,
3066                 .maxlen         = sizeof(int),
3067                 .mode           = 0644,
3068                 .proc_handler   = proc_dointvec,
3069         },
3070         {
3071                 .procname       = "max_size",
3072                 .data           = &ip_rt_max_size,
3073                 .maxlen         = sizeof(int),
3074                 .mode           = 0644,
3075                 .proc_handler   = proc_dointvec,
3076         },
3077         {
3078                 /*  Deprecated. Use gc_min_interval_ms */
3079
3080                 .procname       = "gc_min_interval",
3081                 .data           = &ip_rt_gc_min_interval,
3082                 .maxlen         = sizeof(int),
3083                 .mode           = 0644,
3084                 .proc_handler   = proc_dointvec_jiffies,
3085         },
3086         {
3087                 .procname       = "gc_min_interval_ms",
3088                 .data           = &ip_rt_gc_min_interval,
3089                 .maxlen         = sizeof(int),
3090                 .mode           = 0644,
3091                 .proc_handler   = proc_dointvec_ms_jiffies,
3092         },
3093         {
3094                 .procname       = "gc_timeout",
3095                 .data           = &ip_rt_gc_timeout,
3096                 .maxlen         = sizeof(int),
3097                 .mode           = 0644,
3098                 .proc_handler   = proc_dointvec_jiffies,
3099         },
3100         {
3101                 .procname       = "gc_interval",
3102                 .data           = &ip_rt_gc_interval,
3103                 .maxlen         = sizeof(int),
3104                 .mode           = 0644,
3105                 .proc_handler   = proc_dointvec_jiffies,
3106         },
3107         {
3108                 .procname       = "redirect_load",
3109                 .data           = &ip_rt_redirect_load,
3110                 .maxlen         = sizeof(int),
3111                 .mode           = 0644,
3112                 .proc_handler   = proc_dointvec,
3113         },
3114         {
3115                 .procname       = "redirect_number",
3116                 .data           = &ip_rt_redirect_number,
3117                 .maxlen         = sizeof(int),
3118                 .mode           = 0644,
3119                 .proc_handler   = proc_dointvec,
3120         },
3121         {
3122                 .procname       = "redirect_silence",
3123                 .data           = &ip_rt_redirect_silence,
3124                 .maxlen         = sizeof(int),
3125                 .mode           = 0644,
3126                 .proc_handler   = proc_dointvec,
3127         },
3128         {
3129                 .procname       = "error_cost",
3130                 .data           = &ip_rt_error_cost,
3131                 .maxlen         = sizeof(int),
3132                 .mode           = 0644,
3133                 .proc_handler   = proc_dointvec,
3134         },
3135         {
3136                 .procname       = "error_burst",
3137                 .data           = &ip_rt_error_burst,
3138                 .maxlen         = sizeof(int),
3139                 .mode           = 0644,
3140                 .proc_handler   = proc_dointvec,
3141         },
3142         {
3143                 .procname       = "gc_elasticity",
3144                 .data           = &ip_rt_gc_elasticity,
3145                 .maxlen         = sizeof(int),
3146                 .mode           = 0644,
3147                 .proc_handler   = proc_dointvec,
3148         },
3149         {
3150                 .procname       = "mtu_expires",
3151                 .data           = &ip_rt_mtu_expires,
3152                 .maxlen         = sizeof(int),
3153                 .mode           = 0644,
3154                 .proc_handler   = proc_dointvec_jiffies,
3155         },
3156         {
3157                 .procname       = "min_pmtu",
3158                 .data           = &ip_rt_min_pmtu,
3159                 .maxlen         = sizeof(int),
3160                 .mode           = 0644,
3161                 .proc_handler   = proc_dointvec,
3162         },
3163         {
3164                 .procname       = "min_adv_mss",
3165                 .data           = &ip_rt_min_advmss,
3166                 .maxlen         = sizeof(int),
3167                 .mode           = 0644,
3168                 .proc_handler   = proc_dointvec,
3169         },
3170         { }
3171 };
3172
3173 static struct ctl_table empty[1];
3174
3175 static struct ctl_table ipv4_skeleton[] =
3176 {
3177         { .procname = "route",
3178           .mode = 0555, .child = ipv4_route_table},
3179         { .procname = "neigh",
3180           .mode = 0555, .child = empty},
3181         { }
3182 };
3183
3184 static __net_initdata struct ctl_path ipv4_path[] = {
3185         { .procname = "net", },
3186         { .procname = "ipv4", },
3187         { },
3188 };
3189
3190 static struct ctl_table ipv4_route_flush_table[] = {
3191         {
3192                 .procname       = "flush",
3193                 .maxlen         = sizeof(int),
3194                 .mode           = 0200,
3195                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3196         },
3197         { },
3198 };
3199
3200 static __net_initdata struct ctl_path ipv4_route_path[] = {
3201         { .procname = "net", },
3202         { .procname = "ipv4", },
3203         { .procname = "route", },
3204         { },
3205 };
3206
3207 static __net_init int sysctl_route_net_init(struct net *net)
3208 {
3209         struct ctl_table *tbl;
3210
3211         tbl = ipv4_route_flush_table;
3212         if (!net_eq(net, &init_net)) {
3213                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3214                 if (tbl == NULL)
3215                         goto err_dup;
3216         }
3217         tbl[0].extra1 = net;
3218
3219         net->ipv4.route_hdr =
3220                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3221         if (net->ipv4.route_hdr == NULL)
3222                 goto err_reg;
3223         return 0;
3224
3225 err_reg:
3226         if (tbl != ipv4_route_flush_table)
3227                 kfree(tbl);
3228 err_dup:
3229         return -ENOMEM;
3230 }
3231
3232 static __net_exit void sysctl_route_net_exit(struct net *net)
3233 {
3234         struct ctl_table *tbl;
3235
3236         tbl = net->ipv4.route_hdr->ctl_table_arg;
3237         unregister_net_sysctl_table(net->ipv4.route_hdr);
3238         BUG_ON(tbl == ipv4_route_flush_table);
3239         kfree(tbl);
3240 }
3241
3242 static __net_initdata struct pernet_operations sysctl_route_ops = {
3243         .init = sysctl_route_net_init,
3244         .exit = sysctl_route_net_exit,
3245 };
3246 #endif
3247
3248 static __net_init int rt_genid_init(struct net *net)
3249 {
3250         get_random_bytes(&net->ipv4.rt_genid,
3251                          sizeof(net->ipv4.rt_genid));
3252         return 0;
3253 }
3254
3255 static __net_initdata struct pernet_operations rt_genid_ops = {
3256         .init = rt_genid_init,
3257 };
3258
3259
3260 #ifdef CONFIG_NET_CLS_ROUTE
3261 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3262 #endif /* CONFIG_NET_CLS_ROUTE */
3263
3264 static __initdata unsigned long rhash_entries;
3265 static int __init set_rhash_entries(char *str)
3266 {
3267         if (!str)
3268                 return 0;
3269         rhash_entries = simple_strtoul(str, &str, 0);
3270         return 1;
3271 }
3272 __setup("rhash_entries=", set_rhash_entries);
3273
3274 int __init ip_rt_init(void)
3275 {
3276         int rc = 0;
3277
3278 #ifdef CONFIG_NET_CLS_ROUTE
3279         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3280         if (!ip_rt_acct)
3281                 panic("IP: failed to allocate ip_rt_acct\n");
3282 #endif
3283
3284         ipv4_dst_ops.kmem_cachep =
3285                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3286                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3287
3288         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3289
3290         if (dst_entries_init(&ipv4_dst_ops) < 0)
3291                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3292
3293         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3294                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3295
3296         rt_hash_table = (struct rt_hash_bucket *)
3297                 alloc_large_system_hash("IP route cache",
3298                                         sizeof(struct rt_hash_bucket),
3299                                         rhash_entries,
3300                                         (totalram_pages >= 128 * 1024) ?
3301                                         15 : 17,
3302                                         0,
3303                                         &rt_hash_log,
3304                                         &rt_hash_mask,
3305                                         rhash_entries ? 0 : 512 * 1024);
3306         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3307         rt_hash_lock_init();
3308
3309         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3310         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3311
3312         devinet_init();
3313         ip_fib_init();
3314
3315         /* All the timers, started at system startup tend
3316            to synchronize. Perturb it a bit.
3317          */
3318         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3319         expires_ljiffies = jiffies;
3320         schedule_delayed_work(&expires_work,
3321                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3322
3323         if (ip_rt_proc_init())
3324                 printk(KERN_ERR "Unable to create route proc files\n");
3325 #ifdef CONFIG_XFRM
3326         xfrm_init();
3327         xfrm4_init(ip_rt_max_size);
3328 #endif
3329         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3330
3331 #ifdef CONFIG_SYSCTL
3332         register_pernet_subsys(&sysctl_route_ops);
3333 #endif
3334         register_pernet_subsys(&rt_genid_ops);
3335         return rc;
3336 }
3337
3338 #ifdef CONFIG_SYSCTL
3339 /*
3340  * We really need to sanitize the damn ipv4 init order, then all
3341  * this nonsense will go away.
3342  */
3343 void __init ip_static_sysctl_init(void)
3344 {
3345         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3346 }
3347 #endif