ipv4: Cache learned PMTU information in inetpeer.
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 /*
135  *      Interface to generic destination cache.
136  */
137
138 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
139 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
140 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
141 static void              ipv4_dst_destroy(struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void              ipv4_link_failure(struct sk_buff *skb);
144 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
145 static int rt_garbage_collect(struct dst_ops *ops);
146
147 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
148                             int how)
149 {
150 }
151
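/*
 * Descriptive note added to this copy (not in the original source):
 * ipv4_cow_metrics() implements copy-on-write for dst metrics.  It binds
 * (or creates) the inet_peer for the route, copies the current read-only
 * metrics into peer->metrics on first use, and atomically switches
 * dst->_metrics over with cmpxchg().  If we win the race, the fib_info
 * reference is dropped since the metrics no longer live there; if another
 * CPU won, its pointer is returned instead (or NULL if that pointer is
 * still read-only).
 */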
152 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153 {
154         struct rtable *rt = (struct rtable *) dst;
155         struct inet_peer *peer;
156         u32 *p = NULL;
157
158         if (!rt->peer)
159                 rt_bind_peer(rt, 1);
160
161         peer = rt->peer;
162         if (peer) {
163                 u32 *old_p = __DST_METRICS_PTR(old);
164                 unsigned long prev, new;
165
166                 p = peer->metrics;
167                 if (inet_metrics_new(peer))
168                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
169
170                 new = (unsigned long) p;
171                 prev = cmpxchg(&dst->_metrics, old, new);
172
173                 if (prev != old) {
174                         p = __DST_METRICS_PTR(prev);
175                         if (prev & DST_METRICS_READ_ONLY)
176                                 p = NULL;
177                 } else {
178                         if (rt->fi) {
179                                 fib_info_put(rt->fi);
180                                 rt->fi = NULL;
181                         }
182                 }
183         }
184         return p;
185 }
186
187 static struct dst_ops ipv4_dst_ops = {
188         .family =               AF_INET,
189         .protocol =             cpu_to_be16(ETH_P_IP),
190         .gc =                   rt_garbage_collect,
191         .check =                ipv4_dst_check,
192         .default_advmss =       ipv4_default_advmss,
193         .default_mtu =          ipv4_default_mtu,
194         .cow_metrics =          ipv4_cow_metrics,
195         .destroy =              ipv4_dst_destroy,
196         .ifdown =               ipv4_dst_ifdown,
197         .negative_advice =      ipv4_negative_advice,
198         .link_failure =         ipv4_link_failure,
199         .update_pmtu =          ip_rt_update_pmtu,
200         .local_out =            __ip_local_out,
201 };
202
203 #define ECN_OR_COST(class)      TC_PRIO_##class
204
205 const __u8 ip_tos2prio[16] = {
206         TC_PRIO_BESTEFFORT,
207         ECN_OR_COST(FILLER),
208         TC_PRIO_BESTEFFORT,
209         ECN_OR_COST(BESTEFFORT),
210         TC_PRIO_BULK,
211         ECN_OR_COST(BULK),
212         TC_PRIO_BULK,
213         ECN_OR_COST(BULK),
214         TC_PRIO_INTERACTIVE,
215         ECN_OR_COST(INTERACTIVE),
216         TC_PRIO_INTERACTIVE,
217         ECN_OR_COST(INTERACTIVE),
218         TC_PRIO_INTERACTIVE_BULK,
219         ECN_OR_COST(INTERACTIVE_BULK),
220         TC_PRIO_INTERACTIVE_BULK,
221         ECN_OR_COST(INTERACTIVE_BULK)
222 };
223
224
225 /*
226  * Route cache.
227  */
228
229 /* The locking scheme is rather straightforward:
230  *
231  * 1) Read-Copy Update protects the buckets of the central route hash.
232  * 2) Only writers remove entries, and they hold the lock
233  *    as they look at rtable reference counts.
234  * 3) Only readers acquire references to rtable entries,
235  *    they do so with atomic increments and with the
236  *    lock held.
237  */
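/*
 * Illustrative sketch added to this copy (not in the original source),
 * mirroring how the scheme above is used further down in this file:
 *
 *      rcu_read_lock_bh();
 *      for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *           rth = rcu_dereference_bh(rth->dst.rt_next))
 *              ...                     (lock-free reader, e.g. cache lookup)
 *      rcu_read_unlock_bh();
 *
 *      spin_lock_bh(rt_hash_lock_addr(hash));
 *      rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 *      spin_unlock_bh(rt_hash_lock_addr(hash));   (writer, e.g. rt_intern_hash)
 */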
238
239 struct rt_hash_bucket {
240         struct rtable __rcu     *chain;
241 };
242
243 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
244         defined(CONFIG_PROVE_LOCKING)
245 /*
246  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
247  * The size of this table is a power of two and depends on the number of CPUs.
248  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
249  */
250 #ifdef CONFIG_LOCKDEP
251 # define RT_HASH_LOCK_SZ        256
252 #else
253 # if NR_CPUS >= 32
254 #  define RT_HASH_LOCK_SZ       4096
255 # elif NR_CPUS >= 16
256 #  define RT_HASH_LOCK_SZ       2048
257 # elif NR_CPUS >= 8
258 #  define RT_HASH_LOCK_SZ       1024
259 # elif NR_CPUS >= 4
260 #  define RT_HASH_LOCK_SZ       512
261 # else
262 #  define RT_HASH_LOCK_SZ       256
263 # endif
264 #endif
265
266 static spinlock_t       *rt_hash_locks;
267 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
268
269 static __init void rt_hash_lock_init(void)
270 {
271         int i;
272
273         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
274                         GFP_KERNEL);
275         if (!rt_hash_locks)
276                 panic("IP: failed to allocate rt_hash_locks\n");
277
278         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
279                 spin_lock_init(&rt_hash_locks[i]);
280 }
281 #else
282 # define rt_hash_lock_addr(slot) NULL
283
284 static inline void rt_hash_lock_init(void)
285 {
286 }
287 #endif
288
289 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
290 static unsigned                 rt_hash_mask __read_mostly;
291 static unsigned int             rt_hash_log  __read_mostly;
292
293 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
294 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
295
296 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
297                                    int genid)
298 {
299         return jhash_3words((__force u32)daddr, (__force u32)saddr,
300                             idx, genid)
301                 & rt_hash_mask;
302 }
303
304 static inline int rt_genid(struct net *net)
305 {
306         return atomic_read(&net->ipv4.rt_genid);
307 }
308
309 #ifdef CONFIG_PROC_FS
310 struct rt_cache_iter_state {
311         struct seq_net_private p;
312         int bucket;
313         int genid;
314 };
315
316 static struct rtable *rt_cache_get_first(struct seq_file *seq)
317 {
318         struct rt_cache_iter_state *st = seq->private;
319         struct rtable *r = NULL;
320
321         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
322                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
323                         continue;
324                 rcu_read_lock_bh();
325                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
326                 while (r) {
327                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
328                             r->rt_genid == st->genid)
329                                 return r;
330                         r = rcu_dereference_bh(r->dst.rt_next);
331                 }
332                 rcu_read_unlock_bh();
333         }
334         return r;
335 }
336
337 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
338                                           struct rtable *r)
339 {
340         struct rt_cache_iter_state *st = seq->private;
341
342         r = rcu_dereference_bh(r->dst.rt_next);
343         while (!r) {
344                 rcu_read_unlock_bh();
345                 do {
346                         if (--st->bucket < 0)
347                                 return NULL;
348                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
349                 rcu_read_lock_bh();
350                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
351         }
352         return r;
353 }
354
355 static struct rtable *rt_cache_get_next(struct seq_file *seq,
356                                         struct rtable *r)
357 {
358         struct rt_cache_iter_state *st = seq->private;
359         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
360                 if (dev_net(r->dst.dev) != seq_file_net(seq))
361                         continue;
362                 if (r->rt_genid == st->genid)
363                         break;
364         }
365         return r;
366 }
367
368 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
369 {
370         struct rtable *r = rt_cache_get_first(seq);
371
372         if (r)
373                 while (pos && (r = rt_cache_get_next(seq, r)))
374                         --pos;
375         return pos ? NULL : r;
376 }
377
378 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
379 {
380         struct rt_cache_iter_state *st = seq->private;
381         if (*pos)
382                 return rt_cache_get_idx(seq, *pos - 1);
383         st->genid = rt_genid(seq_file_net(seq));
384         return SEQ_START_TOKEN;
385 }
386
387 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
388 {
389         struct rtable *r;
390
391         if (v == SEQ_START_TOKEN)
392                 r = rt_cache_get_first(seq);
393         else
394                 r = rt_cache_get_next(seq, v);
395         ++*pos;
396         return r;
397 }
398
399 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
400 {
401         if (v && v != SEQ_START_TOKEN)
402                 rcu_read_unlock_bh();
403 }
404
405 static int rt_cache_seq_show(struct seq_file *seq, void *v)
406 {
407         if (v == SEQ_START_TOKEN)
408                 seq_printf(seq, "%-127s\n",
409                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
410                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
411                            "HHUptod\tSpecDst");
412         else {
413                 struct rtable *r = v;
414                 int len;
415
416                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
417                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
418                         r->dst.dev ? r->dst.dev->name : "*",
419                         (__force u32)r->rt_dst,
420                         (__force u32)r->rt_gateway,
421                         r->rt_flags, atomic_read(&r->dst.__refcnt),
422                         r->dst.__use, 0, (__force u32)r->rt_src,
423                         dst_metric_advmss(&r->dst) + 40,
424                         dst_metric(&r->dst, RTAX_WINDOW),
425                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
426                               dst_metric(&r->dst, RTAX_RTTVAR)),
427                         r->fl.fl4_tos,
428                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
429                         r->dst.hh ? (r->dst.hh->hh_output ==
430                                        dev_queue_xmit) : 0,
431                         r->rt_spec_dst, &len);
432
433                 seq_printf(seq, "%*s\n", 127 - len, "");
434         }
435         return 0;
436 }
437
438 static const struct seq_operations rt_cache_seq_ops = {
439         .start  = rt_cache_seq_start,
440         .next   = rt_cache_seq_next,
441         .stop   = rt_cache_seq_stop,
442         .show   = rt_cache_seq_show,
443 };
444
445 static int rt_cache_seq_open(struct inode *inode, struct file *file)
446 {
447         return seq_open_net(inode, file, &rt_cache_seq_ops,
448                         sizeof(struct rt_cache_iter_state));
449 }
450
451 static const struct file_operations rt_cache_seq_fops = {
452         .owner   = THIS_MODULE,
453         .open    = rt_cache_seq_open,
454         .read    = seq_read,
455         .llseek  = seq_lseek,
456         .release = seq_release_net,
457 };
458
459
460 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
461 {
462         int cpu;
463
464         if (*pos == 0)
465                 return SEQ_START_TOKEN;
466
467         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
468                 if (!cpu_possible(cpu))
469                         continue;
470                 *pos = cpu+1;
471                 return &per_cpu(rt_cache_stat, cpu);
472         }
473         return NULL;
474 }
475
476 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
477 {
478         int cpu;
479
480         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
481                 if (!cpu_possible(cpu))
482                         continue;
483                 *pos = cpu+1;
484                 return &per_cpu(rt_cache_stat, cpu);
485         }
486         return NULL;
487
488 }
489
490 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
491 {
492
493 }
494
495 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
496 {
497         struct rt_cache_stat *st = v;
498
499         if (v == SEQ_START_TOKEN) {
500                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
501                 return 0;
502         }
503
504         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
505                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
506                    dst_entries_get_slow(&ipv4_dst_ops),
507                    st->in_hit,
508                    st->in_slow_tot,
509                    st->in_slow_mc,
510                    st->in_no_route,
511                    st->in_brd,
512                    st->in_martian_dst,
513                    st->in_martian_src,
514
515                    st->out_hit,
516                    st->out_slow_tot,
517                    st->out_slow_mc,
518
519                    st->gc_total,
520                    st->gc_ignored,
521                    st->gc_goal_miss,
522                    st->gc_dst_overflow,
523                    st->in_hlist_search,
524                    st->out_hlist_search
525                 );
526         return 0;
527 }
528
529 static const struct seq_operations rt_cpu_seq_ops = {
530         .start  = rt_cpu_seq_start,
531         .next   = rt_cpu_seq_next,
532         .stop   = rt_cpu_seq_stop,
533         .show   = rt_cpu_seq_show,
534 };
535
536
537 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
538 {
539         return seq_open(file, &rt_cpu_seq_ops);
540 }
541
542 static const struct file_operations rt_cpu_seq_fops = {
543         .owner   = THIS_MODULE,
544         .open    = rt_cpu_seq_open,
545         .read    = seq_read,
546         .llseek  = seq_lseek,
547         .release = seq_release,
548 };
549
550 #ifdef CONFIG_IP_ROUTE_CLASSID
551 static int rt_acct_proc_show(struct seq_file *m, void *v)
552 {
553         struct ip_rt_acct *dst, *src;
554         unsigned int i, j;
555
556         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
557         if (!dst)
558                 return -ENOMEM;
559
560         for_each_possible_cpu(i) {
561                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
562                 for (j = 0; j < 256; j++) {
563                         dst[j].o_bytes   += src[j].o_bytes;
564                         dst[j].o_packets += src[j].o_packets;
565                         dst[j].i_bytes   += src[j].i_bytes;
566                         dst[j].i_packets += src[j].i_packets;
567                 }
568         }
569
570         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
571         kfree(dst);
572         return 0;
573 }
574
575 static int rt_acct_proc_open(struct inode *inode, struct file *file)
576 {
577         return single_open(file, rt_acct_proc_show, NULL);
578 }
579
580 static const struct file_operations rt_acct_proc_fops = {
581         .owner          = THIS_MODULE,
582         .open           = rt_acct_proc_open,
583         .read           = seq_read,
584         .llseek         = seq_lseek,
585         .release        = single_release,
586 };
587 #endif
588
589 static int __net_init ip_rt_do_proc_init(struct net *net)
590 {
591         struct proc_dir_entry *pde;
592
593         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
594                         &rt_cache_seq_fops);
595         if (!pde)
596                 goto err1;
597
598         pde = proc_create("rt_cache", S_IRUGO,
599                           net->proc_net_stat, &rt_cpu_seq_fops);
600         if (!pde)
601                 goto err2;
602
603 #ifdef CONFIG_IP_ROUTE_CLASSID
604         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
605         if (!pde)
606                 goto err3;
607 #endif
608         return 0;
609
610 #ifdef CONFIG_IP_ROUTE_CLASSID
611 err3:
612         remove_proc_entry("rt_cache", net->proc_net_stat);
613 #endif
614 err2:
615         remove_proc_entry("rt_cache", net->proc_net);
616 err1:
617         return -ENOMEM;
618 }
619
620 static void __net_exit ip_rt_do_proc_exit(struct net *net)
621 {
622         remove_proc_entry("rt_cache", net->proc_net_stat);
623         remove_proc_entry("rt_cache", net->proc_net);
624 #ifdef CONFIG_IP_ROUTE_CLASSID
625         remove_proc_entry("rt_acct", net->proc_net);
626 #endif
627 }
628
629 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
630         .init = ip_rt_do_proc_init,
631         .exit = ip_rt_do_proc_exit,
632 };
633
634 static int __init ip_rt_proc_init(void)
635 {
636         return register_pernet_subsys(&ip_rt_proc_ops);
637 }
638
639 #else
640 static inline int ip_rt_proc_init(void)
641 {
642         return 0;
643 }
644 #endif /* CONFIG_PROC_FS */
645
646 static inline void rt_free(struct rtable *rt)
647 {
648         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
649 }
650
651 static inline void rt_drop(struct rtable *rt)
652 {
653         ip_rt_put(rt);
654         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
655 }
656
657 static inline int rt_fast_clean(struct rtable *rth)
658 {
659         /* Kill broadcast/multicast entries very aggressively, if they
660            collide in the hash table with more useful entries */
661         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
662                 rt_is_input_route(rth) && rth->dst.rt_next;
663 }
664
665 static inline int rt_valuable(struct rtable *rth)
666 {
667         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
668                 (rth->peer && rth->peer->pmtu_expires);
669 }
670
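/*
 * Descriptive note added to this copy (not in the original source):
 * rt_may_expire() returns true when an unreferenced entry is old enough to
 * be evicted: older than tmo1 (broadcast/multicast input entries that
 * collide in the hash chain may go earlier, see rt_fast_clean()), and, if
 * the entry is "valuable" (redirected, notify, or carrying learned PMTU
 * state), also older than tmo2.
 */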
671 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
672 {
673         unsigned long age;
674         int ret = 0;
675
676         if (atomic_read(&rth->dst.__refcnt))
677                 goto out;
678
679         age = jiffies - rth->dst.lastuse;
680         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
681             (age <= tmo2 && rt_valuable(rth)))
682                 goto out;
683         ret = 1;
684 out:    return ret;
685 }
686
687 /* Bits of score are:
688  * 31: very valuable
689  * 30: not quite useless
690  * 29..0: usage counter
691  */
692 static inline u32 rt_score(struct rtable *rt)
693 {
694         u32 score = jiffies - rt->dst.lastuse;
695
696         score = ~score & ~(3<<30);
697
698         if (rt_valuable(rt))
699                 score |= (1<<31);
700
701         if (rt_is_output_route(rt) ||
702             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
703                 score |= (1<<30);
704
705         return score;
706 }
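/*
 * Worked example added to this copy (not in the original source): between
 * two otherwise identical unreferenced entries, the one that has been idle
 * longer ends up with the smaller 30-bit usage field (the score is ~age
 * masked to 30 bits), so rt_intern_hash() picks it first as its
 * minimum-score eviction candidate.  Bits 31 and 30 then make "valuable"
 * entries and unicast/output entries outrank broadcast/multicast/local
 * input entries regardless of age.
 */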
707
708 static inline bool rt_caching(const struct net *net)
709 {
710         return net->ipv4.current_rt_cache_rebuild_count <=
711                 net->ipv4.sysctl_rt_cache_rebuild_count;
712 }
713
714 static inline bool compare_hash_inputs(const struct flowi *fl1,
715                                         const struct flowi *fl2)
716 {
717         return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
718                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
719                 (fl1->iif ^ fl2->iif)) == 0);
720 }
721
722 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
723 {
724         return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
725                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
726                 (fl1->mark ^ fl2->mark) |
727                 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
728                 (fl1->oif ^ fl2->oif) |
729                 (fl1->iif ^ fl2->iif)) == 0;
730 }
731
732 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
733 {
734         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
735 }
736
737 static inline int rt_is_expired(struct rtable *rth)
738 {
739         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
740 }
741
742 /*
743  * Perform a full scan of the hash table and free all entries.
744  * Can be called by a softirq or a process.
745  * In the latter case, we want to be rescheduled if necessary.
746  */
747 static void rt_do_flush(struct net *net, int process_context)
748 {
749         unsigned int i;
750         struct rtable *rth, *next;
751
752         for (i = 0; i <= rt_hash_mask; i++) {
753                 struct rtable __rcu **pprev;
754                 struct rtable *list;
755
756                 if (process_context && need_resched())
757                         cond_resched();
758                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
759                 if (!rth)
760                         continue;
761
762                 spin_lock_bh(rt_hash_lock_addr(i));
763
764                 list = NULL;
765                 pprev = &rt_hash_table[i].chain;
766                 rth = rcu_dereference_protected(*pprev,
767                         lockdep_is_held(rt_hash_lock_addr(i)));
768
769                 while (rth) {
770                         next = rcu_dereference_protected(rth->dst.rt_next,
771                                 lockdep_is_held(rt_hash_lock_addr(i)));
772
773                         if (!net ||
774                             net_eq(dev_net(rth->dst.dev), net)) {
775                                 rcu_assign_pointer(*pprev, next);
776                                 rcu_assign_pointer(rth->dst.rt_next, list);
777                                 list = rth;
778                         } else {
779                                 pprev = &rth->dst.rt_next;
780                         }
781                         rth = next;
782                 }
783
784                 spin_unlock_bh(rt_hash_lock_addr(i));
785
786                 for (; list; list = next) {
787                         next = rcu_dereference_protected(list->dst.rt_next, 1);
788                         rt_free(list);
789                 }
790         }
791 }
792
793 /*
794  * While freeing expired entries, we compute average chain length
795  * and standard deviation, using fixed-point arithmetic.
796  * This gives an estimation of rt_chain_length_max:
797  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
798  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
799  */
800
801 #define FRACT_BITS 3
802 #define ONE (1UL << FRACT_BITS)
803
804 /*
805  * Given a hash chain and an item in this hash chain,
806  * find if a previous entry has the same hash_inputs
807  * (but differs on tos, mark or oif)
808  * Returns 0 if an alias is found.
809  * Returns ONE if rth has no alias before itself.
810  */
811 static int has_noalias(const struct rtable *head, const struct rtable *rth)
812 {
813         const struct rtable *aux = head;
814
815         while (aux != rth) {
816                 if (compare_hash_inputs(&aux->fl, &rth->fl))
817                         return 0;
818                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
819         }
820         return ONE;
821 }
822
823 /*
824  * Perturbation of rt_genid by a small quantity [1..256].
825  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
826  * many times (2^24) without reusing a recent rt_genid.
827  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
828  */
829 static void rt_cache_invalidate(struct net *net)
830 {
831         unsigned char shuffle;
832
833         get_random_bytes(&shuffle, sizeof(shuffle));
834         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
835 }
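/*
 * Note added to this copy (not in the original source): get_random_bytes()
 * yields shuffle in [0..255], so rt_genid is bumped by 1..256 on every
 * invalidation.  Cached entries keep their old rt_genid and are then seen
 * as stale by rt_is_expired(), so they are dropped lazily (e.g. while
 * rt_intern_hash() walks a chain, or by the garbage collector) instead of
 * being freed synchronously.
 */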
836
837 /*
838  * delay < 0  : invalidate cache (fast : entries will be deleted later)
839  * delay >= 0 : invalidate & flush cache (can be long)
840  */
841 void rt_cache_flush(struct net *net, int delay)
842 {
843         rt_cache_invalidate(net);
844         if (delay >= 0)
845                 rt_do_flush(net, !in_softirq());
846 }
847
848 /* Flush previous cache invalidated entries from the cache */
849 void rt_cache_flush_batch(struct net *net)
850 {
851         rt_do_flush(net, !in_softirq());
852 }
853
854 static void rt_emergency_hash_rebuild(struct net *net)
855 {
856         if (net_ratelimit())
857                 printk(KERN_WARNING "Route hash chain too long!\n");
858         rt_cache_invalidate(net);
859 }
860
861 /*
862    Short description of GC goals.
863
864    We want to build an algorithm which keeps the routing cache
865    at some equilibrium point, where the number of aged-off entries
866    is kept approximately equal to the number of newly generated ones.
867
868    The current expiration strength is the variable "expire".
869    We try to adjust it dynamically, so that if the network
870    is idle, expire is large enough to keep enough warm entries,
871    and when load increases it shrinks to limit the cache size.
872  */
873
874 static int rt_garbage_collect(struct dst_ops *ops)
875 {
876         static unsigned long expire = RT_GC_TIMEOUT;
877         static unsigned long last_gc;
878         static int rover;
879         static int equilibrium;
880         struct rtable *rth;
881         struct rtable __rcu **rthp;
882         unsigned long now = jiffies;
883         int goal;
884         int entries = dst_entries_get_fast(&ipv4_dst_ops);
885
886         /*
887          * Garbage collection is pretty expensive,
888          * so do not run it too frequently.
889          */
890
891         RT_CACHE_STAT_INC(gc_total);
892
893         if (now - last_gc < ip_rt_gc_min_interval &&
894             entries < ip_rt_max_size) {
895                 RT_CACHE_STAT_INC(gc_ignored);
896                 goto out;
897         }
898
899         entries = dst_entries_get_slow(&ipv4_dst_ops);
900         /* Calculate number of entries, which we want to expire now. */
901         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
902         if (goal <= 0) {
903                 if (equilibrium < ipv4_dst_ops.gc_thresh)
904                         equilibrium = ipv4_dst_ops.gc_thresh;
905                 goal = entries - equilibrium;
906                 if (goal > 0) {
907                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
908                         goal = entries - equilibrium;
909                 }
910         } else {
911                 /* We are in a dangerous area. Try to reduce the cache really
912                  * aggressively.
913                  */
914                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
915                 equilibrium = entries - goal;
916         }
917
918         if (now - last_gc >= ip_rt_gc_min_interval)
919                 last_gc = now;
920
921         if (goal <= 0) {
922                 equilibrium += goal;
923                 goto work_done;
924         }
925
926         do {
927                 int i, k;
928
929                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
930                         unsigned long tmo = expire;
931
932                         k = (k + 1) & rt_hash_mask;
933                         rthp = &rt_hash_table[k].chain;
934                         spin_lock_bh(rt_hash_lock_addr(k));
935                         while ((rth = rcu_dereference_protected(*rthp,
936                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
937                                 if (!rt_is_expired(rth) &&
938                                         !rt_may_expire(rth, tmo, expire)) {
939                                         tmo >>= 1;
940                                         rthp = &rth->dst.rt_next;
941                                         continue;
942                                 }
943                                 *rthp = rth->dst.rt_next;
944                                 rt_free(rth);
945                                 goal--;
946                         }
947                         spin_unlock_bh(rt_hash_lock_addr(k));
948                         if (goal <= 0)
949                                 break;
950                 }
951                 rover = k;
952
953                 if (goal <= 0)
954                         goto work_done;
955
956                 /* Goal is not achieved. We stop the process if:
957
958                    - expire has been reduced to zero (otherwise, expire is halved);
959                    - the table is not full;
960                    - we are called from interrupt context;
961                    - the jiffies check is just a fallback/debug loop breaker.
962                      We will not spin here for a long time in any case.
963                  */
964
965                 RT_CACHE_STAT_INC(gc_goal_miss);
966
967                 if (expire == 0)
968                         break;
969
970                 expire >>= 1;
971 #if RT_CACHE_DEBUG >= 2
972                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
973                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
974 #endif
975
976                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
977                         goto out;
978         } while (!in_softirq() && time_before_eq(jiffies, now));
979
980         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
981                 goto out;
982         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
983                 goto out;
984         if (net_ratelimit())
985                 printk(KERN_WARNING "dst cache overflow\n");
986         RT_CACHE_STAT_INC(gc_dst_overflow);
987         return 1;
988
989 work_done:
990         expire += ip_rt_gc_min_interval;
991         if (expire > ip_rt_gc_timeout ||
992             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
993             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
994                 expire = ip_rt_gc_timeout;
995 #if RT_CACHE_DEBUG >= 2
996         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
997                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
998 #endif
999 out:    return 0;
1000 }
1001
1002 /*
1003  * Returns number of entries in a hash chain that have different hash_inputs
1004  */
1005 static int slow_chain_length(const struct rtable *head)
1006 {
1007         int length = 0;
1008         const struct rtable *rth = head;
1009
1010         while (rth) {
1011                 length += has_noalias(head, rth);
1012                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1013         }
1014         return length >> FRACT_BITS;
1015 }
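/*
 * Worked example added to this copy (not in the original source): with
 * FRACT_BITS = 3, ONE = 8.  For a chain of five entries where two of them
 * repeat the hash inputs of an earlier entry, has_noalias() contributes
 * 8 for each of the three distinct entries and 0 for the two aliases, so
 * the sum is 24 and slow_chain_length() returns 24 >> 3 = 3, i.e. the
 * number of distinct hash_inputs in the chain.
 */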
1016
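/*
 * Descriptive note added to this copy (not in the original source):
 * rt_intern_hash() inserts a new cache entry into its hash chain, or
 * reuses (and moves to the front) an existing entry with the same key.
 * While walking the chain it tracks the unreferenced entry with the lowest
 * rt_score(); if the chain grows beyond ip_rt_gc_elasticity that candidate
 * is evicted, and if even the number of distinct-key entries exceeds
 * rt_chain_length_max an emergency hash rebuild is triggered.
 * Output/unicast routes are also bound to an ARP neighbour here.
 */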
1017 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1018                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1019 {
1020         struct rtable   *rth, *cand;
1021         struct rtable __rcu **rthp, **candp;
1022         unsigned long   now;
1023         u32             min_score;
1024         int             chain_length;
1025         int attempts = !in_softirq();
1026
1027 restart:
1028         chain_length = 0;
1029         min_score = ~(u32)0;
1030         cand = NULL;
1031         candp = NULL;
1032         now = jiffies;
1033
1034         if (!rt_caching(dev_net(rt->dst.dev))) {
1035                 /*
1036                  * If we're not caching, just tell the caller we
1037                  * were successful and don't touch the route.  The
1038                  * caller holds the sole reference to the cache entry, and
1039                  * it will be released when the caller is done with it.
1040                  * If we drop it here, the callers have no way to resolve routes
1041                  * when we're not caching.  Instead, just point *rp at rt, so
1042                  * the caller gets a single use out of the route.
1043                  * Note that we do rt_free on this new route entry, so that
1044                  * once its refcount hits zero, we are still able to reap it
1045                  * (Thanks Alexey)
1046                  * Note: To avoid expensive rcu stuff for this uncached dst,
1047                  * we set DST_NOCACHE so that dst_release() can free dst without
1048                  * waiting for a grace period.
1049                  */
1050
1051                 rt->dst.flags |= DST_NOCACHE;
1052                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1053                         int err = arp_bind_neighbour(&rt->dst);
1054                         if (err) {
1055                                 if (net_ratelimit())
1056                                         printk(KERN_WARNING
1057                                             "Neighbour table failure & not caching routes.\n");
1058                                 ip_rt_put(rt);
1059                                 return err;
1060                         }
1061                 }
1062
1063                 goto skip_hashing;
1064         }
1065
1066         rthp = &rt_hash_table[hash].chain;
1067
1068         spin_lock_bh(rt_hash_lock_addr(hash));
1069         while ((rth = rcu_dereference_protected(*rthp,
1070                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1071                 if (rt_is_expired(rth)) {
1072                         *rthp = rth->dst.rt_next;
1073                         rt_free(rth);
1074                         continue;
1075                 }
1076                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1077                         /* Put it first */
1078                         *rthp = rth->dst.rt_next;
1079                         /*
1080                          * Since lookup is lockfree, the deletion
1081                          * must be visible to another weakly ordered CPU before
1082                          * the insertion at the start of the hash chain.
1083                          */
1084                         rcu_assign_pointer(rth->dst.rt_next,
1085                                            rt_hash_table[hash].chain);
1086                         /*
1087                          * Since lookup is lockfree, the update writes
1088                          * must be ordered for consistency on SMP.
1089                          */
1090                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1091
1092                         dst_use(&rth->dst, now);
1093                         spin_unlock_bh(rt_hash_lock_addr(hash));
1094
1095                         rt_drop(rt);
1096                         if (rp)
1097                                 *rp = rth;
1098                         else
1099                                 skb_dst_set(skb, &rth->dst);
1100                         return 0;
1101                 }
1102
1103                 if (!atomic_read(&rth->dst.__refcnt)) {
1104                         u32 score = rt_score(rth);
1105
1106                         if (score <= min_score) {
1107                                 cand = rth;
1108                                 candp = rthp;
1109                                 min_score = score;
1110                         }
1111                 }
1112
1113                 chain_length++;
1114
1115                 rthp = &rth->dst.rt_next;
1116         }
1117
1118         if (cand) {
1119                 /* ip_rt_gc_elasticity used to be the average chain
1120                  * length; when exceeded, GC becomes really aggressive.
1121                  *
1122                  * The second limit is less certain. At the moment it allows
1123                  * only 2 entries per bucket. We will see.
1124                  */
1125                 if (chain_length > ip_rt_gc_elasticity) {
1126                         *candp = cand->dst.rt_next;
1127                         rt_free(cand);
1128                 }
1129         } else {
1130                 if (chain_length > rt_chain_length_max &&
1131                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1132                         struct net *net = dev_net(rt->dst.dev);
1133                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1134                         if (!rt_caching(net)) {
1135                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1136                                         rt->dst.dev->name, num);
1137                         }
1138                         rt_emergency_hash_rebuild(net);
1139                         spin_unlock_bh(rt_hash_lock_addr(hash));
1140
1141                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1142                                         ifindex, rt_genid(net));
1143                         goto restart;
1144                 }
1145         }
1146
1147         /* Try to bind the route to ARP only if it is an output
1148            route or on the unicast forwarding path.
1149          */
1150         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1151                 int err = arp_bind_neighbour(&rt->dst);
1152                 if (err) {
1153                         spin_unlock_bh(rt_hash_lock_addr(hash));
1154
1155                         if (err != -ENOBUFS) {
1156                                 rt_drop(rt);
1157                                 return err;
1158                         }
1159
1160                         /* Neighbour tables are full and nothing
1161                            can be released. Try to shrink the route cache;
1162                            it most likely holds some neighbour records.
1163                          */
1164                         if (attempts-- > 0) {
1165                                 int saved_elasticity = ip_rt_gc_elasticity;
1166                                 int saved_int = ip_rt_gc_min_interval;
1167                                 ip_rt_gc_elasticity     = 1;
1168                                 ip_rt_gc_min_interval   = 0;
1169                                 rt_garbage_collect(&ipv4_dst_ops);
1170                                 ip_rt_gc_min_interval   = saved_int;
1171                                 ip_rt_gc_elasticity     = saved_elasticity;
1172                                 goto restart;
1173                         }
1174
1175                         if (net_ratelimit())
1176                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1177                         rt_drop(rt);
1178                         return -ENOBUFS;
1179                 }
1180         }
1181
1182         rt->dst.rt_next = rt_hash_table[hash].chain;
1183
1184 #if RT_CACHE_DEBUG >= 2
1185         if (rt->dst.rt_next) {
1186                 struct rtable *trt;
1187                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1188                        hash, &rt->rt_dst);
1189                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1190                         printk(" . %pI4", &trt->rt_dst);
1191                 printk("\n");
1192         }
1193 #endif
1194         /*
1195          * Since lookup is lockfree, we must make sure
1196          * previous writes to rt are committed to memory
1197          * before making rt visible to other CPUs.
1198          */
1199         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1200
1201         spin_unlock_bh(rt_hash_lock_addr(hash));
1202
1203 skip_hashing:
1204         if (rp)
1205                 *rp = rt;
1206         else
1207                 skb_dst_set(skb, &rt->dst);
1208         return 0;
1209 }
1210
1211 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1212
1213 static u32 rt_peer_genid(void)
1214 {
1215         return atomic_read(&__rt_peer_genid);
1216 }
1217
1218 void rt_bind_peer(struct rtable *rt, int create)
1219 {
1220         struct inet_peer *peer;
1221
1222         peer = inet_getpeer_v4(rt->rt_dst, create);
1223
1224         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1225                 inet_putpeer(peer);
1226         else
1227                 rt->rt_peer_genid = rt_peer_genid();
1228 }
1229
1230 /*
1231  * Peer allocation may fail only in serious out-of-memory conditions.  However
1232  * we can still generate some output.
1233  * Random ID selection looks a bit dangerous because we have no chance of
1234  * selecting an ID that is unique within a reasonable period of time.
1235  * But a broken packet identifier may be better than no packet at all.
1236  */
1237 static void ip_select_fb_ident(struct iphdr *iph)
1238 {
1239         static DEFINE_SPINLOCK(ip_fb_id_lock);
1240         static u32 ip_fallback_id;
1241         u32 salt;
1242
1243         spin_lock_bh(&ip_fb_id_lock);
1244         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1245         iph->id = htons(salt & 0xFFFF);
1246         ip_fallback_id = salt;
1247         spin_unlock_bh(&ip_fb_id_lock);
1248 }
1249
1250 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1251 {
1252         struct rtable *rt = (struct rtable *) dst;
1253
1254         if (rt) {
1255                 if (rt->peer == NULL)
1256                         rt_bind_peer(rt, 1);
1257
1258                 /* If peer is attached to destination, it is never detached,
1259                    so we need not grab a lock to dereference it.
1260                  */
1261                 if (rt->peer) {
1262                         iph->id = htons(inet_getid(rt->peer, more));
1263                         return;
1264                 }
1265         } else
1266                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1267                        __builtin_return_address(0));
1268
1269         ip_select_fb_ident(iph);
1270 }
1271 EXPORT_SYMBOL(__ip_select_ident);
1272
1273 static void rt_del(unsigned hash, struct rtable *rt)
1274 {
1275         struct rtable __rcu **rthp;
1276         struct rtable *aux;
1277
1278         rthp = &rt_hash_table[hash].chain;
1279         spin_lock_bh(rt_hash_lock_addr(hash));
1280         ip_rt_put(rt);
1281         while ((aux = rcu_dereference_protected(*rthp,
1282                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1283                 if (aux == rt || rt_is_expired(aux)) {
1284                         *rthp = aux->dst.rt_next;
1285                         rt_free(aux);
1286                         continue;
1287                 }
1288                 rthp = &aux->dst.rt_next;
1289         }
1290         spin_unlock_bh(rt_hash_lock_addr(hash));
1291 }
1292
1293 /* called in rcu_read_lock() section */
1294 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1295                     __be32 saddr, struct net_device *dev)
1296 {
1297         int i, k;
1298         struct in_device *in_dev = __in_dev_get_rcu(dev);
1299         struct rtable *rth;
1300         struct rtable __rcu **rthp;
1301         __be32  skeys[2] = { saddr, 0 };
1302         int  ikeys[2] = { dev->ifindex, 0 };
1303         struct netevent_redirect netevent;
1304         struct net *net;
1305
1306         if (!in_dev)
1307                 return;
1308
1309         net = dev_net(dev);
1310         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1311             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1312             ipv4_is_zeronet(new_gw))
1313                 goto reject_redirect;
1314
1315         if (!rt_caching(net))
1316                 goto reject_redirect;
1317
1318         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1319                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1320                         goto reject_redirect;
1321                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1322                         goto reject_redirect;
1323         } else {
1324                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1325                         goto reject_redirect;
1326         }
1327
1328         for (i = 0; i < 2; i++) {
1329                 for (k = 0; k < 2; k++) {
1330                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1331                                                 rt_genid(net));
1332
1333                         rthp = &rt_hash_table[hash].chain;
1334
1335                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1336                                 struct rtable *rt;
1337
1338                                 if (rth->fl.fl4_dst != daddr ||
1339                                     rth->fl.fl4_src != skeys[i] ||
1340                                     rth->fl.oif != ikeys[k] ||
1341                                     rt_is_input_route(rth) ||
1342                                     rt_is_expired(rth) ||
1343                                     !net_eq(dev_net(rth->dst.dev), net)) {
1344                                         rthp = &rth->dst.rt_next;
1345                                         continue;
1346                                 }
1347
1348                                 if (rth->rt_dst != daddr ||
1349                                     rth->rt_src != saddr ||
1350                                     rth->dst.error ||
1351                                     rth->rt_gateway != old_gw ||
1352                                     rth->dst.dev != dev)
1353                                         break;
1354
1355                                 dst_hold(&rth->dst);
1356
1357                                 rt = dst_alloc(&ipv4_dst_ops);
1358                                 if (rt == NULL) {
1359                                         ip_rt_put(rth);
1360                                         return;
1361                                 }
1362
1363                                 /* Copy all the information. */
1364                                 *rt = *rth;
1365                                 rt->dst.__use           = 1;
1366                                 atomic_set(&rt->dst.__refcnt, 1);
1367                                 rt->dst.child           = NULL;
1368                                 if (rt->dst.dev)
1369                                         dev_hold(rt->dst.dev);
1370                                 rt->dst.obsolete        = -1;
1371                                 rt->dst.lastuse = jiffies;
1372                                 rt->dst.path            = &rt->dst;
1373                                 rt->dst.neighbour       = NULL;
1374                                 rt->dst.hh              = NULL;
1375 #ifdef CONFIG_XFRM
1376                                 rt->dst.xfrm            = NULL;
1377 #endif
1378                                 rt->rt_genid            = rt_genid(net);
1379                                 rt->rt_flags            |= RTCF_REDIRECTED;
1380
1381                                 /* Gateway is different ... */
1382                                 rt->rt_gateway          = new_gw;
1383
1384                                 /* Redirect received -> path was valid */
1385                                 dst_confirm(&rth->dst);
1386
1387                                 if (rt->peer)
1388                                         atomic_inc(&rt->peer->refcnt);
1389                                 if (rt->fi)
1390                                         atomic_inc(&rt->fi->fib_clntref);
1391
1392                                 if (arp_bind_neighbour(&rt->dst) ||
1393                                     !(rt->dst.neighbour->nud_state &
1394                                             NUD_VALID)) {
1395                                         if (rt->dst.neighbour)
1396                                                 neigh_event_send(rt->dst.neighbour, NULL);
1397                                         ip_rt_put(rth);
1398                                         rt_drop(rt);
1399                                         goto do_next;
1400                                 }
1401
1402                                 netevent.old = &rth->dst;
1403                                 netevent.new = &rt->dst;
1404                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1405                                                         &netevent);
1406
1407                                 rt_del(hash, rth);
1408                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1409                                         ip_rt_put(rt);
1410                                 goto do_next;
1411                         }
1412                 do_next:
1413                         ;
1414                 }
1415         }
1416         return;
1417
1418 reject_redirect:
1419 #ifdef CONFIG_IP_ROUTE_VERBOSE
1420         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1421                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1422                         "  Advised path = %pI4 -> %pI4\n",
1423                        &old_gw, dev->name, &new_gw,
1424                        &saddr, &daddr);
1425 #endif
1426         ;
1427 }
1428
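/*
 * Descriptive note added to this copy (not in the original source):
 * ipv4_negative_advice() is called when an upper layer reports trouble
 * with a cached route: obsolete entries and learned redirects are dropped
 * from the cache (returning NULL so the caller looks the route up again),
 * and an expired learned PMTU is rolled back to peer->pmtu_orig.
 */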
1429 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1430 {
1431         struct rtable *rt = (struct rtable *)dst;
1432         struct dst_entry *ret = dst;
1433
1434         if (rt) {
1435                 if (dst->obsolete > 0) {
1436                         ip_rt_put(rt);
1437                         ret = NULL;
1438                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1439                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1440                                                 rt->fl.oif,
1441                                                 rt_genid(dev_net(dst->dev)));
1442 #if RT_CACHE_DEBUG >= 1
1443                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1444                                 &rt->rt_dst, rt->fl.fl4_tos);
1445 #endif
1446                         rt_del(hash, rt);
1447                         ret = NULL;
1448                 } else if (rt->peer &&
1449                            rt->peer->pmtu_expires &&
1450                            time_after_eq(jiffies, rt->peer->pmtu_expires)) {
1451                         unsigned long orig = rt->peer->pmtu_expires;
1452
1453                         if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1454                                 dst_metric_set(dst, RTAX_MTU,
1455                                                rt->peer->pmtu_orig);
1456                 }
1457         }
1458         return ret;
1459 }
1460
1461 /*
1462  * Algorithm:
1463  *      1. The first ip_rt_redirect_number redirects are sent
1464  *         with exponential backoff, after which we stop sending them
1465  *         altogether, assuming that the host ignores our redirects.
1466  *      2. If we did not see packets requiring redirects
1467  *         during ip_rt_redirect_silence, we assume that the host
1468  *         has forgotten the redirected route and start sending redirects again.
1469  *
1470  * This algorithm is much cheaper and more intelligent than dumb load limiting
1471  * in icmp.c.
1472  *
1473  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1474  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1475  */
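/*
 * A rough worked example of the backoff above (a sketch only; the exact
 * sysctl defaults are declared earlier in this file): redirect number k
 * is emitted only once
 *
 *      time_after(jiffies,
 *                 rate_last + (ip_rt_redirect_load << rate_tokens))
 *
 * becomes true, so the gaps between redirects grow as load, 2*load,
 * 4*load, ...  After ip_rt_redirect_number unanswered redirects we go
 * silent until ip_rt_redirect_silence jiffies pass without
 * redirect-worthy traffic, at which point rate_tokens is reset to zero.
 */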
1476
1477 void ip_rt_send_redirect(struct sk_buff *skb)
1478 {
1479         struct rtable *rt = skb_rtable(skb);
1480         struct in_device *in_dev;
1481         struct inet_peer *peer;
1482         int log_martians;
1483
1484         rcu_read_lock();
1485         in_dev = __in_dev_get_rcu(rt->dst.dev);
1486         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1487                 rcu_read_unlock();
1488                 return;
1489         }
1490         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1491         rcu_read_unlock();
1492
1493         if (!rt->peer)
1494                 rt_bind_peer(rt, 1);
1495         peer = rt->peer;
1496         if (!peer) {
1497                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1498                 return;
1499         }
1500
1501         /* No redirected packets during ip_rt_redirect_silence;
1502          * reset the algorithm.
1503          */
1504         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1505                 peer->rate_tokens = 0;
1506
1507         /* Too many ignored redirects; do not send anything.
1508          * Set peer->rate_last to the time of the last seen redirected packet.
1509          */
1510         if (peer->rate_tokens >= ip_rt_redirect_number) {
1511                 peer->rate_last = jiffies;
1512                 return;
1513         }
1514
1515         /* Check for load limit; set rate_last to the latest sent
1516          * redirect.
1517          */
1518         if (peer->rate_tokens == 0 ||
1519             time_after(jiffies,
1520                        (peer->rate_last +
1521                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1522                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1523                 peer->rate_last = jiffies;
1524                 ++peer->rate_tokens;
1525 #ifdef CONFIG_IP_ROUTE_VERBOSE
1526                 if (log_martians &&
1527                     peer->rate_tokens == ip_rt_redirect_number &&
1528                     net_ratelimit())
1529                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1530                                 &rt->rt_src, rt->rt_iif,
1531                                 &rt->rt_dst, &rt->rt_gateway);
1532 #endif
1533         }
1534 }
1535
1536 static int ip_error(struct sk_buff *skb)
1537 {
1538         struct rtable *rt = skb_rtable(skb);
1539         struct inet_peer *peer;
1540         unsigned long now;
1541         bool send;
1542         int code;
1543
1544         switch (rt->dst.error) {
1545                 case EINVAL:
1546                 default:
1547                         goto out;
1548                 case EHOSTUNREACH:
1549                         code = ICMP_HOST_UNREACH;
1550                         break;
1551                 case ENETUNREACH:
1552                         code = ICMP_NET_UNREACH;
1553                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1554                                         IPSTATS_MIB_INNOROUTES);
1555                         break;
1556                 case EACCES:
1557                         code = ICMP_PKT_FILTERED;
1558                         break;
1559         }
1560
1561         if (!rt->peer)
1562                 rt_bind_peer(rt, 1);
1563         peer = rt->peer;
1564
1565         send = true;
1566         if (peer) {
1567                 now = jiffies;
1568                 peer->rate_tokens += now - peer->rate_last;
1569                 if (peer->rate_tokens > ip_rt_error_burst)
1570                         peer->rate_tokens = ip_rt_error_burst;
1571                 peer->rate_last = now;
1572                 if (peer->rate_tokens >= ip_rt_error_cost)
1573                         peer->rate_tokens -= ip_rt_error_cost;
1574                 else
1575                         send = false;
1576         }
1577         if (send)
1578                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1579
1580 out:    kfree_skb(skb);
1581         return 0;
1582 }
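/*
 * The peer handling in ip_error() above is a small token bucket (a sketch,
 * assuming the ip_rt_error_cost and ip_rt_error_burst defaults declared
 * earlier in this file, typically about one and five seconds worth of
 * jiffies): tokens accrue at one per elapsed jiffy up to the burst limit,
 * and each ICMP_DEST_UNREACH sent costs ip_rt_error_cost tokens, so in
 * steady state at most one error is emitted per ip_rt_error_cost jiffies.
 */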
1583
1584 /*
1585  *      The last two values are not from the RFC but
1586  *      are needed for AMPRnet AX.25 paths.
1587  */
1588
1589 static const unsigned short mtu_plateau[] =
1590 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1591
1592 static inline unsigned short guess_mtu(unsigned short old_mtu)
1593 {
1594         int i;
1595
1596         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1597                 if (old_mtu > mtu_plateau[i])
1598                         return mtu_plateau[i];
1599         return 68;
1600 }
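/*
 * For illustration: guess_mtu() returns the first plateau value strictly
 * below the size of the packet that did not fit, so guess_mtu(1500) yields
 * 1492, guess_mtu(600) yields 576, and anything at or below 128 falls
 * through to the 68-octet minimum MTU required by RFC 791.
 */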
1601
1602 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1603                                  unsigned short new_mtu,
1604                                  struct net_device *dev)
1605 {
1606         unsigned short old_mtu = ntohs(iph->tot_len);
1607         unsigned short est_mtu = 0;
1608         struct inet_peer *peer;
1609
1610         peer = inet_getpeer_v4(iph->daddr, 1);
1611         if (peer) {
1612                 unsigned short mtu = new_mtu;
1613
1614                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1615                         /* BSD 4.2 derived systems incorrectly adjust
1616                          * tot_len by the IP header length, and report
1617                          * a zero MTU in the ICMP message.
1618                          */
1619                         if (mtu == 0 &&
1620                             old_mtu >= 68 + (iph->ihl << 2))
1621                                 old_mtu -= iph->ihl << 2;
1622                         mtu = guess_mtu(old_mtu);
1623                 }
1624
1625                 if (mtu < ip_rt_min_pmtu)
1626                         mtu = ip_rt_min_pmtu;
1627                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1628                         est_mtu = mtu;
1629                         peer->pmtu_learned = mtu;
1630                         peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
1631                 }
1632
1633                 inet_putpeer(peer);
1634
1635                 atomic_inc(&__rt_peer_genid);
1636         }
1637         return est_mtu ? : new_mtu;
1638 }
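/*
 * A worked example of the quirk handled above (hypothetical scenario, the
 * constants come from the code): a pre-RFC1191 router reports a zero
 * next-hop MTU and, being 4.2BSD-derived, quotes a tot_len adjusted by the
 * header length, say 1520 for a 1500-byte datagram with a 20-byte header.
 * ip_rt_frag_needed() strips those 20 bytes again, guess_mtu(1500) picks
 * the 1492 plateau and, as long as that is not below ip_rt_min_pmtu, the
 * peer learns a PMTU of 1492 for the next ip_rt_mtu_expires jiffies.
 */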
1639
1640 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1641 {
1642         unsigned long expires = peer->pmtu_expires;
1643
1644         if (time_before(expires, jiffies)) {
1645                 u32 orig_dst_mtu = dst_mtu(dst);
1646                 if (peer->pmtu_learned < orig_dst_mtu) {
1647                         if (!peer->pmtu_orig)
1648                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1649                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1650                 }
1651         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1652                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1653 }
1654
1655 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1656 {
1657         struct rtable *rt = (struct rtable *) dst;
1658         struct inet_peer *peer;
1659
1660         dst_confirm(dst);
1661
1662         if (!rt->peer)
1663                 rt_bind_peer(rt, 1);
1664         peer = rt->peer;
1665         if (peer) {
1666                 if (mtu < ip_rt_min_pmtu)
1667                         mtu = ip_rt_min_pmtu;
1668                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1669                         peer->pmtu_learned = mtu;
1670                         peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
1671
1672                         atomic_inc(&__rt_peer_genid);
1673                         rt->rt_peer_genid = rt_peer_genid();
1674
1675                         check_peer_pmtu(dst, peer);
1676                 }
1677                 inet_putpeer(peer);
1678         }
1679 }
1680
1681 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1682 {
1683         struct rtable *rt = (struct rtable *) dst;
1684
1685         if (rt_is_expired(rt))
1686                 return NULL;
1687         if (rt->rt_peer_genid != rt_peer_genid()) {
1688                 struct inet_peer *peer;
1689
1690                 if (!rt->peer)
1691                         rt_bind_peer(rt, 0);
1692
1693                 peer = rt->peer;
1694                 if (peer && peer->pmtu_expires)
1695                         check_peer_pmtu(dst, peer);
1696
1697                 rt->rt_peer_genid = rt_peer_genid();
1698         }
1699         return dst;
1700 }
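/*
 * Note on the genid handling above: ip_rt_frag_needed() and
 * ip_rt_update_pmtu() bump __rt_peer_genid whenever a peer learns a new
 * PMTU, so cached routes do not have to be touched at that point; each one
 * notices the changed generation here on its next dst_check() and re-syncs
 * its MTU metric from the peer via check_peer_pmtu().
 */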
1701
1702 static void ipv4_dst_destroy(struct dst_entry *dst)
1703 {
1704         struct rtable *rt = (struct rtable *) dst;
1705         struct inet_peer *peer = rt->peer;
1706
1707         if (rt->fi) {
1708                 fib_info_put(rt->fi);
1709                 rt->fi = NULL;
1710         }
1711         if (peer) {
1712                 rt->peer = NULL;
1713                 inet_putpeer(peer);
1714         }
1715 }
1716
1717
1718 static void ipv4_link_failure(struct sk_buff *skb)
1719 {
1720         struct rtable *rt;
1721
1722         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1723
1724         rt = skb_rtable(skb);
1725         if (rt &&
1726             rt->peer &&
1727             rt->peer->pmtu_expires) {
1728                 unsigned long orig = rt->peer->pmtu_expires;
1729
1730                 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1731                         dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1732         }
1733 }
1734
1735 static int ip_rt_bug(struct sk_buff *skb)
1736 {
1737         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1738                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1739                 skb->dev ? skb->dev->name : "?");
1740         kfree_skb(skb);
1741         return 0;
1742 }
1743
1744 /*
1745    We do not cache the source address of the outgoing interface,
1746    because it is used only by the IP RR, TS and SRR options,
1747    so it is out of the fast path.
1748
1749    BTW remember: "addr" is not necessarily aligned in IP options,
1750    hence the memcpy() below instead of a direct store!
1751  */
1752
1753 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1754 {
1755         __be32 src;
1756         struct fib_result res;
1757
1758         if (rt_is_output_route(rt))
1759                 src = rt->rt_src;
1760         else {
1761                 rcu_read_lock();
1762                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1763                         src = FIB_RES_PREFSRC(res);
1764                 else
1765                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1766                                         RT_SCOPE_UNIVERSE);
1767                 rcu_read_unlock();
1768         }
1769         memcpy(addr, &src, 4);
1770 }
1771
1772 #ifdef CONFIG_IP_ROUTE_CLASSID
1773 static void set_class_tag(struct rtable *rt, u32 tag)
1774 {
1775         if (!(rt->dst.tclassid & 0xFFFF))
1776                 rt->dst.tclassid |= tag & 0xFFFF;
1777         if (!(rt->dst.tclassid & 0xFFFF0000))
1778                 rt->dst.tclassid |= tag & 0xFFFF0000;
1779 }
1780 #endif
1781
1782 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1783 {
1784         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1785
1786         if (advmss == 0) {
1787                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1788                                ip_rt_min_advmss);
1789                 if (advmss > 65535 - 40)
1790                         advmss = 65535 - 40;
1791         }
1792         return advmss;
1793 }
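/*
 * For example, on a 1500-byte MTU device with no ADVMSS metric configured,
 * ipv4_default_advmss() above yields 1500 - 40 = 1460, i.e. the MTU minus
 * the standard 20-byte IP and 20-byte TCP headers, clamped to at least
 * ip_rt_min_advmss and at most 65535 - 40.
 */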
1794
1795 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1796 {
1797         unsigned int mtu = dst->dev->mtu;
1798
1799         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1800                 const struct rtable *rt = (const struct rtable *) dst;
1801
1802                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1803                         mtu = 576;
1804         }
1805
1806         if (mtu > IP_MAX_MTU)
1807                 mtu = IP_MAX_MTU;
1808
1809         return mtu;
1810 }
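/*
 * Example: if the MTU metric is locked (e.g. via "ip route ... mtu lock")
 * and the route goes through a gateway (rt_gateway != rt_dst),
 * ipv4_default_mtu() above reports at most the conservative 576 bytes
 * rather than the device MTU; otherwise the device MTU is used, capped at
 * IP_MAX_MTU.
 */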
1811
1812 static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
1813 {
1814         struct inet_peer *peer;
1815         int create = 0;
1816
1817         /* If a peer entry exists for this destination, we must hook
1818          * it up in order to get at cached metrics.
1819          */
1820         if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)
1821                 create = 1;
1822
1823         rt_bind_peer(rt, create);
1824         peer = rt->peer;
1825         if (peer) {
1826                 if (inet_metrics_new(peer))
1827                         memcpy(peer->metrics, fi->fib_metrics,
1828                                sizeof(u32) * RTAX_MAX);
1829                 dst_init_metrics(&rt->dst, peer->metrics, false);
1830
1831                 if (peer->pmtu_expires)
1832                         check_peer_pmtu(&rt->dst, peer);
1833         } else {
1834                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1835                         rt->fi = fi;
1836                         atomic_inc(&fi->fib_clntref);
1837                 }
1838                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1839         }
1840 }
1841
1842 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1843 {
1844         struct dst_entry *dst = &rt->dst;
1845         struct fib_info *fi = res->fi;
1846
1847         if (fi) {
1848                 if (FIB_RES_GW(*res) &&
1849                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1850                         rt->rt_gateway = FIB_RES_GW(*res);
1851                 rt_init_metrics(rt, fi);
1852 #ifdef CONFIG_IP_ROUTE_CLASSID
1853                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1854 #endif
1855         }
1856
1857         if (dst_mtu(dst) > IP_MAX_MTU)
1858                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1859         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1860                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1861
1862 #ifdef CONFIG_IP_ROUTE_CLASSID
1863 #ifdef CONFIG_IP_MULTIPLE_TABLES
1864         set_class_tag(rt, fib_rules_tclass(res));
1865 #endif
1866         set_class_tag(rt, itag);
1867 #endif
1868         rt->rt_type = res->type;
1869 }
1870
1871 /* called in rcu_read_lock() section */
1872 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1873                                 u8 tos, struct net_device *dev, int our)
1874 {
1875         unsigned int hash;
1876         struct rtable *rth;
1877         __be32 spec_dst;
1878         struct in_device *in_dev = __in_dev_get_rcu(dev);
1879         u32 itag = 0;
1880         int err;
1881
1882         /* Primary sanity checks. */
1883
1884         if (in_dev == NULL)
1885                 return -EINVAL;
1886
1887         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1888             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1889                 goto e_inval;
1890
1891         if (ipv4_is_zeronet(saddr)) {
1892                 if (!ipv4_is_local_multicast(daddr))
1893                         goto e_inval;
1894                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1895         } else {
1896                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1897                                           &itag, 0);
1898                 if (err < 0)
1899                         goto e_err;
1900         }
1901         rth = dst_alloc(&ipv4_dst_ops);
1902         if (!rth)
1903                 goto e_nobufs;
1904
1905         rth->dst.output = ip_rt_bug;
1906         rth->dst.obsolete = -1;
1907
1908         atomic_set(&rth->dst.__refcnt, 1);
1909         rth->dst.flags= DST_HOST;
1910         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1911                 rth->dst.flags |= DST_NOPOLICY;
1912         rth->fl.fl4_dst = daddr;
1913         rth->rt_dst     = daddr;
1914         rth->fl.fl4_tos = tos;
1915         rth->fl.mark    = skb->mark;
1916         rth->fl.fl4_src = saddr;
1917         rth->rt_src     = saddr;
1918 #ifdef CONFIG_IP_ROUTE_CLASSID
1919         rth->dst.tclassid = itag;
1920 #endif
1921         rth->rt_iif     =
1922         rth->fl.iif     = dev->ifindex;
1923         rth->dst.dev    = init_net.loopback_dev;
1924         dev_hold(rth->dst.dev);
1925         rth->fl.oif     = 0;
1926         rth->rt_gateway = daddr;
1927         rth->rt_spec_dst= spec_dst;
1928         rth->rt_genid   = rt_genid(dev_net(dev));
1929         rth->rt_flags   = RTCF_MULTICAST;
1930         rth->rt_type    = RTN_MULTICAST;
1931         if (our) {
1932                 rth->dst.input= ip_local_deliver;
1933                 rth->rt_flags |= RTCF_LOCAL;
1934         }
1935
1936 #ifdef CONFIG_IP_MROUTE
1937         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1938                 rth->dst.input = ip_mr_input;
1939 #endif
1940         RT_CACHE_STAT_INC(in_slow_mc);
1941
1942         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1943         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1944
1945 e_nobufs:
1946         return -ENOBUFS;
1947 e_inval:
1948         return -EINVAL;
1949 e_err:
1950         return err;
1951 }
1952
1953
1954 static void ip_handle_martian_source(struct net_device *dev,
1955                                      struct in_device *in_dev,
1956                                      struct sk_buff *skb,
1957                                      __be32 daddr,
1958                                      __be32 saddr)
1959 {
1960         RT_CACHE_STAT_INC(in_martian_src);
1961 #ifdef CONFIG_IP_ROUTE_VERBOSE
1962         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1963                 /*
1964                  *      RFC1812 recommendation: if the source is martian,
1965                  *      the only hint is the MAC header.
1966                  */
1967                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1968                         &daddr, &saddr, dev->name);
1969                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1970                         int i;
1971                         const unsigned char *p = skb_mac_header(skb);
1972                         printk(KERN_WARNING "ll header: ");
1973                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1974                                 printk("%02x", *p);
1975                                 if (i < (dev->hard_header_len - 1))
1976                                         printk(":");
1977                         }
1978                         printk("\n");
1979                 }
1980         }
1981 #endif
1982 }
1983
1984 /* called in rcu_read_lock() section */
1985 static int __mkroute_input(struct sk_buff *skb,
1986                            struct fib_result *res,
1987                            struct in_device *in_dev,
1988                            __be32 daddr, __be32 saddr, u32 tos,
1989                            struct rtable **result)
1990 {
1991         struct rtable *rth;
1992         int err;
1993         struct in_device *out_dev;
1994         unsigned int flags = 0;
1995         __be32 spec_dst;
1996         u32 itag;
1997
1998         /* get a working reference to the output device */
1999         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2000         if (out_dev == NULL) {
2001                 if (net_ratelimit())
2002                         printk(KERN_CRIT "Bug in ip_route_input" \
2003                                "_slow(). Please, report\n");
2004                 return -EINVAL;
2005         }
2006
2007
2008         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2009                                   in_dev->dev, &spec_dst, &itag, skb->mark);
2010         if (err < 0) {
2011                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2012                                          saddr);
2013
2014                 goto cleanup;
2015         }
2016
2017         if (err)
2018                 flags |= RTCF_DIRECTSRC;
2019
2020         if (out_dev == in_dev && err &&
2021             (IN_DEV_SHARED_MEDIA(out_dev) ||
2022              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2023                 flags |= RTCF_DOREDIRECT;
2024
2025         if (skb->protocol != htons(ETH_P_IP)) {
2026                 /* Not IP (i.e. ARP). Do not create a route if it is
2027                  * invalid for proxy ARP. DNAT routes are always valid.
2028                  *
2029                  * The proxy ARP feature has been extended to allow ARP
2030                  * replies back on the same interface, to support
2031                  * private VLAN switch technologies. See arp.c.
2032                  */
2033                 if (out_dev == in_dev &&
2034                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2035                         err = -EINVAL;
2036                         goto cleanup;
2037                 }
2038         }
2039
2040
2041         rth = dst_alloc(&ipv4_dst_ops);
2042         if (!rth) {
2043                 err = -ENOBUFS;
2044                 goto cleanup;
2045         }
2046
2047         atomic_set(&rth->dst.__refcnt, 1);
2048         rth->dst.flags= DST_HOST;
2049         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2050                 rth->dst.flags |= DST_NOPOLICY;
2051         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2052                 rth->dst.flags |= DST_NOXFRM;
2053         rth->fl.fl4_dst = daddr;
2054         rth->rt_dst     = daddr;
2055         rth->fl.fl4_tos = tos;
2056         rth->fl.mark    = skb->mark;
2057         rth->fl.fl4_src = saddr;
2058         rth->rt_src     = saddr;
2059         rth->rt_gateway = daddr;
2060         rth->rt_iif     =
2061                 rth->fl.iif     = in_dev->dev->ifindex;
2062         rth->dst.dev    = (out_dev)->dev;
2063         dev_hold(rth->dst.dev);
2064         rth->fl.oif     = 0;
2065         rth->rt_spec_dst= spec_dst;
2066
2067         rth->dst.obsolete = -1;
2068         rth->dst.input = ip_forward;
2069         rth->dst.output = ip_output;
2070         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2071
2072         rt_set_nexthop(rth, res, itag);
2073
2074         rth->rt_flags = flags;
2075
2076         *result = rth;
2077         err = 0;
2078  cleanup:
2079         return err;
2080 }
2081
2082 static int ip_mkroute_input(struct sk_buff *skb,
2083                             struct fib_result *res,
2084                             const struct flowi *fl,
2085                             struct in_device *in_dev,
2086                             __be32 daddr, __be32 saddr, u32 tos)
2087 {
2088         struct rtable* rth = NULL;
2089         int err;
2090         unsigned hash;
2091
2092 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2093         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2094                 fib_select_multipath(fl, res);
2095 #endif
2096
2097         /* create a routing cache entry */
2098         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2099         if (err)
2100                 return err;
2101
2102         /* put it into the cache */
2103         hash = rt_hash(daddr, saddr, fl->iif,
2104                        rt_genid(dev_net(rth->dst.dev)));
2105         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2106 }
2107
2108 /*
2109  *      NOTE. We drop all packets that have a local source
2110  *      address, because every properly looped-back packet
2111  *      must already have the correct destination attached by the output routine.
2112  *
2113  *      This approach solves two big problems:
2114  *      1. Non-simplex devices are handled properly.
2115  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2116  *      called with rcu_read_lock()
2117  */
2118
2119 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2120                                u8 tos, struct net_device *dev)
2121 {
2122         struct fib_result res;
2123         struct in_device *in_dev = __in_dev_get_rcu(dev);
2124         struct flowi fl = { .fl4_dst    = daddr,
2125                             .fl4_src    = saddr,
2126                             .fl4_tos    = tos,
2127                             .fl4_scope  = RT_SCOPE_UNIVERSE,
2128                             .mark = skb->mark,
2129                             .iif = dev->ifindex };
2130         unsigned        flags = 0;
2131         u32             itag = 0;
2132         struct rtable * rth;
2133         unsigned        hash;
2134         __be32          spec_dst;
2135         int             err = -EINVAL;
2136         struct net    * net = dev_net(dev);
2137
2138         /* IP on this device is disabled. */
2139
2140         if (!in_dev)
2141                 goto out;
2142
2143         /* Check for the most weird martians, which cannot be detected
2144            by fib_lookup.
2145          */
2146
2147         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2148             ipv4_is_loopback(saddr))
2149                 goto martian_source;
2150
2151         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2152                 goto brd_input;
2153
2154         /* Accept zero addresses only to limited broadcast;
2155          * I do not even know whether to fix it or not. Waiting for complaints :-)
2156          */
2157         if (ipv4_is_zeronet(saddr))
2158                 goto martian_source;
2159
2160         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2161                 goto martian_destination;
2162
2163         /*
2164          *      Now we are ready to route the packet.
2165          */
2166         err = fib_lookup(net, &fl, &res);
2167         if (err != 0) {
2168                 if (!IN_DEV_FORWARD(in_dev))
2169                         goto e_hostunreach;
2170                 goto no_route;
2171         }
2172
2173         RT_CACHE_STAT_INC(in_slow_tot);
2174
2175         if (res.type == RTN_BROADCAST)
2176                 goto brd_input;
2177
2178         if (res.type == RTN_LOCAL) {
2179                 err = fib_validate_source(saddr, daddr, tos,
2180                                           net->loopback_dev->ifindex,
2181                                           dev, &spec_dst, &itag, skb->mark);
2182                 if (err < 0)
2183                         goto martian_source_keep_err;
2184                 if (err)
2185                         flags |= RTCF_DIRECTSRC;
2186                 spec_dst = daddr;
2187                 goto local_input;
2188         }
2189
2190         if (!IN_DEV_FORWARD(in_dev))
2191                 goto e_hostunreach;
2192         if (res.type != RTN_UNICAST)
2193                 goto martian_destination;
2194
2195         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2196 out:    return err;
2197
2198 brd_input:
2199         if (skb->protocol != htons(ETH_P_IP))
2200                 goto e_inval;
2201
2202         if (ipv4_is_zeronet(saddr))
2203                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2204         else {
2205                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2206                                           &itag, skb->mark);
2207                 if (err < 0)
2208                         goto martian_source_keep_err;
2209                 if (err)
2210                         flags |= RTCF_DIRECTSRC;
2211         }
2212         flags |= RTCF_BROADCAST;
2213         res.type = RTN_BROADCAST;
2214         RT_CACHE_STAT_INC(in_brd);
2215
2216 local_input:
2217         rth = dst_alloc(&ipv4_dst_ops);
2218         if (!rth)
2219                 goto e_nobufs;
2220
2221         rth->dst.output= ip_rt_bug;
2222         rth->dst.obsolete = -1;
2223         rth->rt_genid = rt_genid(net);
2224
2225         atomic_set(&rth->dst.__refcnt, 1);
2226         rth->dst.flags= DST_HOST;
2227         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2228                 rth->dst.flags |= DST_NOPOLICY;
2229         rth->fl.fl4_dst = daddr;
2230         rth->rt_dst     = daddr;
2231         rth->fl.fl4_tos = tos;
2232         rth->fl.mark    = skb->mark;
2233         rth->fl.fl4_src = saddr;
2234         rth->rt_src     = saddr;
2235 #ifdef CONFIG_IP_ROUTE_CLASSID
2236         rth->dst.tclassid = itag;
2237 #endif
2238         rth->rt_iif     =
2239         rth->fl.iif     = dev->ifindex;
2240         rth->dst.dev    = net->loopback_dev;
2241         dev_hold(rth->dst.dev);
2242         rth->rt_gateway = daddr;
2243         rth->rt_spec_dst= spec_dst;
2244         rth->dst.input= ip_local_deliver;
2245         rth->rt_flags   = flags|RTCF_LOCAL;
2246         if (res.type == RTN_UNREACHABLE) {
2247                 rth->dst.input= ip_error;
2248                 rth->dst.error= -err;
2249                 rth->rt_flags   &= ~RTCF_LOCAL;
2250         }
2251         rth->rt_type    = res.type;
2252         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2253         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2254         goto out;
2255
2256 no_route:
2257         RT_CACHE_STAT_INC(in_no_route);
2258         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2259         res.type = RTN_UNREACHABLE;
2260         if (err == -ESRCH)
2261                 err = -ENETUNREACH;
2262         goto local_input;
2263
2264         /*
2265          *      Do not cache martian addresses: they should be logged (RFC1812)
2266          */
2267 martian_destination:
2268         RT_CACHE_STAT_INC(in_martian_dst);
2269 #ifdef CONFIG_IP_ROUTE_VERBOSE
2270         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2271                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2272                         &daddr, &saddr, dev->name);
2273 #endif
2274
2275 e_hostunreach:
2276         err = -EHOSTUNREACH;
2277         goto out;
2278
2279 e_inval:
2280         err = -EINVAL;
2281         goto out;
2282
2283 e_nobufs:
2284         err = -ENOBUFS;
2285         goto out;
2286
2287 martian_source:
2288         err = -EINVAL;
2289 martian_source_keep_err:
2290         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2291         goto out;
2292 }
2293
2294 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2295                            u8 tos, struct net_device *dev, bool noref)
2296 {
2297         struct rtable * rth;
2298         unsigned        hash;
2299         int iif = dev->ifindex;
2300         struct net *net;
2301         int res;
2302
2303         net = dev_net(dev);
2304
2305         rcu_read_lock();
2306
2307         if (!rt_caching(net))
2308                 goto skip_cache;
2309
2310         tos &= IPTOS_RT_MASK;
2311         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2312
2313         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2314              rth = rcu_dereference(rth->dst.rt_next)) {
2315                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2316                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2317                      (rth->fl.iif ^ iif) |
2318                      rth->fl.oif |
2319                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2320                     rth->fl.mark == skb->mark &&
2321                     net_eq(dev_net(rth->dst.dev), net) &&
2322                     !rt_is_expired(rth)) {
2323                         if (noref) {
2324                                 dst_use_noref(&rth->dst, jiffies);
2325                                 skb_dst_set_noref(skb, &rth->dst);
2326                         } else {
2327                                 dst_use(&rth->dst, jiffies);
2328                                 skb_dst_set(skb, &rth->dst);
2329                         }
2330                         RT_CACHE_STAT_INC(in_hit);
2331                         rcu_read_unlock();
2332                         return 0;
2333                 }
2334                 RT_CACHE_STAT_INC(in_hlist_search);
2335         }
2336
2337 skip_cache:
2338         /* The multicast recognition logic has been moved from the route
2339            cache to here. The problem was that too many Ethernet cards have
2340            broken/missing hardware multicast filters :-( As a result, a host
2341            on a multicast network acquires a lot of useless route cache
2342            entries, e.g. for SDR messages from all over the world. Now we try
2343            to get rid of them. Really, provided the software IP multicast
2344            filter is organized reasonably (at least, hashed), it does not
2345            result in a slowdown compared with route cache reject entries.
2346            Note that multicast routers are not affected, because
2347            a route cache entry is created eventually.
2348          */
2349         if (ipv4_is_multicast(daddr)) {
2350                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2351
2352                 if (in_dev) {
2353                         int our = ip_check_mc(in_dev, daddr, saddr,
2354                                               ip_hdr(skb)->protocol);
2355                         if (our
2356 #ifdef CONFIG_IP_MROUTE
2357                                 ||
2358                             (!ipv4_is_local_multicast(daddr) &&
2359                              IN_DEV_MFORWARD(in_dev))
2360 #endif
2361                            ) {
2362                                 int res = ip_route_input_mc(skb, daddr, saddr,
2363                                                             tos, dev, our);
2364                                 rcu_read_unlock();
2365                                 return res;
2366                         }
2367                 }
2368                 rcu_read_unlock();
2369                 return -EINVAL;
2370         }
2371         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2372         rcu_read_unlock();
2373         return res;
2374 }
2375 EXPORT_SYMBOL(ip_route_input_common);
2376
2377 /* called with rcu_read_lock() */
2378 static int __mkroute_output(struct rtable **result,
2379                             struct fib_result *res,
2380                             const struct flowi *fl,
2381                             const struct flowi *oldflp,
2382                             struct net_device *dev_out,
2383                             unsigned flags)
2384 {
2385         struct rtable *rth;
2386         struct in_device *in_dev;
2387         u32 tos = RT_FL_TOS(oldflp);
2388
2389         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2390                 return -EINVAL;
2391
2392         if (ipv4_is_lbcast(fl->fl4_dst))
2393                 res->type = RTN_BROADCAST;
2394         else if (ipv4_is_multicast(fl->fl4_dst))
2395                 res->type = RTN_MULTICAST;
2396         else if (ipv4_is_zeronet(fl->fl4_dst))
2397                 return -EINVAL;
2398
2399         if (dev_out->flags & IFF_LOOPBACK)
2400                 flags |= RTCF_LOCAL;
2401
2402         in_dev = __in_dev_get_rcu(dev_out);
2403         if (!in_dev)
2404                 return -EINVAL;
2405
2406         if (res->type == RTN_BROADCAST) {
2407                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2408                 res->fi = NULL;
2409         } else if (res->type == RTN_MULTICAST) {
2410                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2411                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2412                                  oldflp->proto))
2413                         flags &= ~RTCF_LOCAL;
2414                 /* If a multicast route does not exist, use
2415                  * the default one, but do not gateway in this case.
2416                  * Yes, it is a hack.
2417                  */
2418                 if (res->fi && res->prefixlen < 4)
2419                         res->fi = NULL;
2420         }
2421
2422
2423         rth = dst_alloc(&ipv4_dst_ops);
2424         if (!rth)
2425                 return -ENOBUFS;
2426
2427         atomic_set(&rth->dst.__refcnt, 1);
2428         rth->dst.flags= DST_HOST;
2429         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2430                 rth->dst.flags |= DST_NOXFRM;
2431         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2432                 rth->dst.flags |= DST_NOPOLICY;
2433
2434         rth->fl.fl4_dst = oldflp->fl4_dst;
2435         rth->fl.fl4_tos = tos;
2436         rth->fl.fl4_src = oldflp->fl4_src;
2437         rth->fl.oif     = oldflp->oif;
2438         rth->fl.mark    = oldflp->mark;
2439         rth->rt_dst     = fl->fl4_dst;
2440         rth->rt_src     = fl->fl4_src;
2441         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2442         /* Get references to the devices that are to be held by the routing
2443            cache entry. */
2444         rth->dst.dev    = dev_out;
2445         dev_hold(dev_out);
2446         rth->rt_gateway = fl->fl4_dst;
2447         rth->rt_spec_dst= fl->fl4_src;
2448
2449         rth->dst.output=ip_output;
2450         rth->dst.obsolete = -1;
2451         rth->rt_genid = rt_genid(dev_net(dev_out));
2452
2453         RT_CACHE_STAT_INC(out_slow_tot);
2454
2455         if (flags & RTCF_LOCAL) {
2456                 rth->dst.input = ip_local_deliver;
2457                 rth->rt_spec_dst = fl->fl4_dst;
2458         }
2459         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2460                 rth->rt_spec_dst = fl->fl4_src;
2461                 if (flags & RTCF_LOCAL &&
2462                     !(dev_out->flags & IFF_LOOPBACK)) {
2463                         rth->dst.output = ip_mc_output;
2464                         RT_CACHE_STAT_INC(out_slow_mc);
2465                 }
2466 #ifdef CONFIG_IP_MROUTE
2467                 if (res->type == RTN_MULTICAST) {
2468                         if (IN_DEV_MFORWARD(in_dev) &&
2469                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2470                                 rth->dst.input = ip_mr_input;
2471                                 rth->dst.output = ip_mc_output;
2472                         }
2473                 }
2474 #endif
2475         }
2476
2477         rt_set_nexthop(rth, res, 0);
2478
2479         rth->rt_flags = flags;
2480         *result = rth;
2481         return 0;
2482 }
2483
2484 /* called with rcu_read_lock() */
2485 static int ip_mkroute_output(struct rtable **rp,
2486                              struct fib_result *res,
2487                              const struct flowi *fl,
2488                              const struct flowi *oldflp,
2489                              struct net_device *dev_out,
2490                              unsigned flags)
2491 {
2492         struct rtable *rth = NULL;
2493         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2494         unsigned hash;
2495         if (err == 0) {
2496                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2497                                rt_genid(dev_net(dev_out)));
2498                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2499         }
2500
2501         return err;
2502 }
2503
2504 /*
2505  * Major route resolver routine.
2506  * called with rcu_read_lock();
2507  */
2508
2509 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2510                                 const struct flowi *oldflp)
2511 {
2512         u32 tos = RT_FL_TOS(oldflp);
2513         struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2514                             .fl4_src = oldflp->fl4_src,
2515                             .fl4_tos = tos & IPTOS_RT_MASK,
2516                             .fl4_scope = ((tos & RTO_ONLINK) ?
2517                                           RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2518                             .mark = oldflp->mark,
2519                             .iif = net->loopback_dev->ifindex,
2520                             .oif = oldflp->oif };
2521         struct fib_result res;
2522         unsigned int flags = 0;
2523         struct net_device *dev_out = NULL;
2524         int err;
2525
2526
2527         res.fi          = NULL;
2528 #ifdef CONFIG_IP_MULTIPLE_TABLES
2529         res.r           = NULL;
2530 #endif
2531
2532         if (oldflp->fl4_src) {
2533                 err = -EINVAL;
2534                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2535                     ipv4_is_lbcast(oldflp->fl4_src) ||
2536                     ipv4_is_zeronet(oldflp->fl4_src))
2537                         goto out;
2538
2539                 /* I removed the check for oif == dev_out->oif here.
2540                    It was wrong for two reasons:
2541                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2542                       is assigned to multiple interfaces.
2543                    2. Moreover, we are allowed to send packets with the saddr
2544                       of another iface. --ANK
2545                  */
2546
2547                 if (oldflp->oif == 0 &&
2548                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2549                      ipv4_is_lbcast(oldflp->fl4_dst))) {
2550                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2551                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2552                         if (dev_out == NULL)
2553                                 goto out;
2554
2555                         /* Special hack: user can direct multicasts
2556                            and limited broadcast via the desired interface
2557                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2558                            This hack is not just for fun, it allows
2559                            vic, vat and friends to work.
2560                            They bind the socket to loopback, set the ttl to zero
2561                            and expect that it will work.
2562                            From the viewpoint of the routing cache they are broken,
2563                            because we are not allowed to build a multicast path
2564                            with a loopback source addr (the routing cache
2565                            cannot know that the ttl is zero, so the packet
2566                            will never leave this host and the route looks valid).
2567                            Luckily, this hack is a good workaround.
2568                          */
2569
2570                         fl.oif = dev_out->ifindex;
2571                         goto make_route;
2572                 }
2573
2574                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2575                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2576                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2577                                 goto out;
2578                 }
2579         }
2580
2581
2582         if (oldflp->oif) {
2583                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2584                 err = -ENODEV;
2585                 if (dev_out == NULL)
2586                         goto out;
2587
2588                 /* RACE: Check return value of inet_select_addr instead. */
2589                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2590                         err = -ENETUNREACH;
2591                         goto out;
2592                 }
2593                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2594                     ipv4_is_lbcast(oldflp->fl4_dst)) {
2595                         if (!fl.fl4_src)
2596                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2597                                                               RT_SCOPE_LINK);
2598                         goto make_route;
2599                 }
2600                 if (!fl.fl4_src) {
2601                         if (ipv4_is_multicast(oldflp->fl4_dst))
2602                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2603                                                               fl.fl4_scope);
2604                         else if (!oldflp->fl4_dst)
2605                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2606                                                               RT_SCOPE_HOST);
2607                 }
2608         }
2609
2610         if (!fl.fl4_dst) {
2611                 fl.fl4_dst = fl.fl4_src;
2612                 if (!fl.fl4_dst)
2613                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2614                 dev_out = net->loopback_dev;
2615                 fl.oif = net->loopback_dev->ifindex;
2616                 res.type = RTN_LOCAL;
2617                 flags |= RTCF_LOCAL;
2618                 goto make_route;
2619         }
2620
2621         if (fib_lookup(net, &fl, &res)) {
2622                 res.fi = NULL;
2623                 if (oldflp->oif) {
2624                         /* Apparently, the routing tables are wrong. Assume
2625                            that the destination is on-link.
2626
2627                            WHY? DW.
2628                            Because we are allowed to send to an iface
2629                            even if it has NO routes and NO assigned
2630                            addresses. When oif is specified, the routing
2631                            tables are looked up with only one purpose:
2632                            to check whether the destination is gatewayed, rather
2633                            than direct. Moreover, if MSG_DONTROUTE is set,
2634                            we send the packet, ignoring both the routing tables
2635                            and the ifaddr state. --ANK
2636
2637
2638                            We could do this even if oif is unknown,
2639                            as IPv6 likely does, but we do not.
2640                          */
2641
2642                         if (fl.fl4_src == 0)
2643                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2644                                                               RT_SCOPE_LINK);
2645                         res.type = RTN_UNICAST;
2646                         goto make_route;
2647                 }
2648                 err = -ENETUNREACH;
2649                 goto out;
2650         }
2651
2652         if (res.type == RTN_LOCAL) {
2653                 if (!fl.fl4_src) {
2654                         if (res.fi->fib_prefsrc)
2655                                 fl.fl4_src = res.fi->fib_prefsrc;
2656                         else
2657                                 fl.fl4_src = fl.fl4_dst;
2658                 }
2659                 dev_out = net->loopback_dev;
2660                 fl.oif = dev_out->ifindex;
2661                 res.fi = NULL;
2662                 flags |= RTCF_LOCAL;
2663                 goto make_route;
2664         }
2665
2666 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2667         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2668                 fib_select_multipath(&fl, &res);
2669         else
2670 #endif
2671         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2672                 fib_select_default(&res);
2673
2674         if (!fl.fl4_src)
2675                 fl.fl4_src = FIB_RES_PREFSRC(res);
2676
2677         dev_out = FIB_RES_DEV(res);
2678         fl.oif = dev_out->ifindex;
2679
2680
2681 make_route:
2682         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2683
2684 out:    return err;
2685 }
2686
2687 int __ip_route_output_key(struct net *net, struct rtable **rp,
2688                           const struct flowi *flp)
2689 {
2690         unsigned int hash;
2691         int res;
2692         struct rtable *rth;
2693
2694         if (!rt_caching(net))
2695                 goto slow_output;
2696
2697         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2698
2699         rcu_read_lock_bh();
2700         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2701                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2702                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2703                     rth->fl.fl4_src == flp->fl4_src &&
2704                     rt_is_output_route(rth) &&
2705                     rth->fl.oif == flp->oif &&
2706                     rth->fl.mark == flp->mark &&
2707                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2708                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2709                     net_eq(dev_net(rth->dst.dev), net) &&
2710                     !rt_is_expired(rth)) {
2711                         dst_use(&rth->dst, jiffies);
2712                         RT_CACHE_STAT_INC(out_hit);
2713                         rcu_read_unlock_bh();
2714                         *rp = rth;
2715                         return 0;
2716                 }
2717                 RT_CACHE_STAT_INC(out_hlist_search);
2718         }
2719         rcu_read_unlock_bh();
2720
2721 slow_output:
2722         rcu_read_lock();
2723         res = ip_route_output_slow(net, rp, flp);
2724         rcu_read_unlock();
2725         return res;
2726 }
2727 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2728
2729 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2730 {
2731         return NULL;
2732 }
2733
2734 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2735 {
2736         return 0;
2737 }
2738
2739 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2740 {
2741 }
2742
2743 static struct dst_ops ipv4_dst_blackhole_ops = {
2744         .family                 =       AF_INET,
2745         .protocol               =       cpu_to_be16(ETH_P_IP),
2746         .destroy                =       ipv4_dst_destroy,
2747         .check                  =       ipv4_blackhole_dst_check,
2748         .default_mtu            =       ipv4_blackhole_default_mtu,
2749         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2750 };
2751
2752
2753 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2754 {
2755         struct rtable *ort = *rp;
2756         struct rtable *rt = (struct rtable *)
2757                 dst_alloc(&ipv4_dst_blackhole_ops);
2758
2759         if (rt) {
2760                 struct dst_entry *new = &rt->dst;
2761
2762                 atomic_set(&new->__refcnt, 1);
2763                 new->__use = 1;
2764                 new->input = dst_discard;
2765                 new->output = dst_discard;
2766                 dst_copy_metrics(new, &ort->dst);
2767
2768                 new->dev = ort->dst.dev;
2769                 if (new->dev)
2770                         dev_hold(new->dev);
2771
2772                 rt->fl = ort->fl;
2773
2774                 rt->rt_genid = rt_genid(net);
2775                 rt->rt_flags = ort->rt_flags;
2776                 rt->rt_type = ort->rt_type;
2777                 rt->rt_dst = ort->rt_dst;
2778                 rt->rt_src = ort->rt_src;
2779                 rt->rt_iif = ort->rt_iif;
2780                 rt->rt_gateway = ort->rt_gateway;
2781                 rt->rt_spec_dst = ort->rt_spec_dst;
2782                 rt->peer = ort->peer;
2783                 if (rt->peer)
2784                         atomic_inc(&rt->peer->refcnt);
2785                 rt->fi = ort->fi;
2786                 if (rt->fi)
2787                         atomic_inc(&rt->fi->fib_clntref);
2788
2789                 dst_free(new);
2790         }
2791
2792         dst_release(&(*rp)->dst);
2793         *rp = rt;
2794         return rt ? 0 : -ENOMEM;
2795 }
2796
2797 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2798                          struct sock *sk, int flags)
2799 {
2800         int err;
2801
2802         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2803                 return err;
2804
2805         if (flp->proto) {
2806                 if (!flp->fl4_src)
2807                         flp->fl4_src = (*rp)->rt_src;
2808                 if (!flp->fl4_dst)
2809                         flp->fl4_dst = (*rp)->rt_dst;
2810                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2811                                     flags ? XFRM_LOOKUP_WAIT : 0);
2812                 if (err == -EREMOTE)
2813                         err = ipv4_dst_blackhole(net, rp, flp);
2814
2815                 return err;
2816         }
2817
2818         return 0;
2819 }
2820 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2821
2822 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2823 {
2824         return ip_route_output_flow(net, rp, flp, NULL, 0);
2825 }
2826 EXPORT_SYMBOL(ip_route_output_key);
2827
2828 static int rt_fill_info(struct net *net,
2829                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2830                         int nowait, unsigned int flags)
2831 {
2832         struct rtable *rt = skb_rtable(skb);
2833         struct rtmsg *r;
2834         struct nlmsghdr *nlh;
2835         long expires;
2836         u32 id = 0, ts = 0, tsage = 0, error;
2837
2838         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2839         if (nlh == NULL)
2840                 return -EMSGSIZE;
2841
2842         r = nlmsg_data(nlh);
2843         r->rtm_family    = AF_INET;
2844         r->rtm_dst_len  = 32;
2845         r->rtm_src_len  = 0;
2846         r->rtm_tos      = rt->fl.fl4_tos;
2847         r->rtm_table    = RT_TABLE_MAIN;
2848         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2849         r->rtm_type     = rt->rt_type;
2850         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2851         r->rtm_protocol = RTPROT_UNSPEC;
2852         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2853         if (rt->rt_flags & RTCF_NOTIFY)
2854                 r->rtm_flags |= RTM_F_NOTIFY;
2855
2856         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2857
2858         if (rt->fl.fl4_src) {
2859                 r->rtm_src_len = 32;
2860                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2861         }
2862         if (rt->dst.dev)
2863                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2864 #ifdef CONFIG_IP_ROUTE_CLASSID
2865         if (rt->dst.tclassid)
2866                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2867 #endif
2868         if (rt_is_input_route(rt))
2869                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2870         else if (rt->rt_src != rt->fl.fl4_src)
2871                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2872
2873         if (rt->rt_dst != rt->rt_gateway)
2874                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2875
2876         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2877                 goto nla_put_failure;
2878
2879         if (rt->fl.mark)
2880                 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2881
2882         error = rt->dst.error;
2883         expires = (rt->peer && rt->peer->pmtu_expires) ?
2884                 rt->peer->pmtu_expires - jiffies : 0;
2885         if (rt->peer) {
2886                 inet_peer_refcheck(rt->peer);
2887                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2888                 if (rt->peer->tcp_ts_stamp) {
2889                         ts = rt->peer->tcp_ts;
2890                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2891                 }
2892         }
2893
2894         if (rt_is_input_route(rt)) {
2895 #ifdef CONFIG_IP_MROUTE
2896                 __be32 dst = rt->rt_dst;
2897
2898                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2899                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2900                         int err = ipmr_get_route(net, skb, r, nowait);
2901                         if (err <= 0) {
2902                                 if (!nowait) {
2903                                         if (err == 0)
2904                                                 return 0;
2905                                         goto nla_put_failure;
2906                                 } else {
2907                                         if (err == -EMSGSIZE)
2908                                                 goto nla_put_failure;
2909                                         error = err;
2910                                 }
2911                         }
2912                 } else
2913 #endif
2914                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2915         }
2916
2917         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2918                                expires, error) < 0)
2919                 goto nla_put_failure;
2920
2921         return nlmsg_end(skb, nlh);
2922
2923 nla_put_failure:
2924         nlmsg_cancel(skb, nlh);
2925         return -EMSGSIZE;
2926 }
2927
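/*
 * inet_rtm_getroute() services RTM_GETROUTE requests (e.g. what the
 * userspace "ip route get" command sends).  If RTA_IIF is given it
 * simulates an incoming packet via ip_route_input(); otherwise it does an
 * output lookup through ip_route_output_key().  The resulting cache entry
 * is serialised with rt_fill_info() and unicast back to the requester.
 */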
2928 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2929 {
2930         struct net *net = sock_net(in_skb->sk);
2931         struct rtmsg *rtm;
2932         struct nlattr *tb[RTA_MAX+1];
2933         struct rtable *rt = NULL;
2934         __be32 dst = 0;
2935         __be32 src = 0;
2936         u32 iif;
2937         int err;
2938         int mark;
2939         struct sk_buff *skb;
2940
2941         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2942         if (err < 0)
2943                 goto errout;
2944
2945         rtm = nlmsg_data(nlh);
2946
2947         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2948         if (skb == NULL) {
2949                 err = -ENOBUFS;
2950                 goto errout;
2951         }
2952
2953         /* Reserve room for dummy headers; this skb can pass
2954            through a good chunk of the routing engine.
2955          */
2956         skb_reset_mac_header(skb);
2957         skb_reset_network_header(skb);
2958
2959         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2960         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2961         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2962
2963         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2964         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2965         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2966         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2967
2968         if (iif) {
2969                 struct net_device *dev;
2970
2971                 dev = __dev_get_by_index(net, iif);
2972                 if (dev == NULL) {
2973                         err = -ENODEV;
2974                         goto errout_free;
2975                 }
2976
2977                 skb->protocol   = htons(ETH_P_IP);
2978                 skb->dev        = dev;
2979                 skb->mark       = mark;
2980                 local_bh_disable();
2981                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2982                 local_bh_enable();
2983
2984                 rt = skb_rtable(skb);
2985                 if (err == 0 && rt->dst.error)
2986                         err = -rt->dst.error;
2987         } else {
2988                 struct flowi fl = {
2989                         .fl4_dst = dst,
2990                         .fl4_src = src,
2991                         .fl4_tos = rtm->rtm_tos,
2992                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2993                         .mark = mark,
2994                 };
2995                 err = ip_route_output_key(net, &rt, &fl);
2996         }
2997
2998         if (err)
2999                 goto errout_free;
3000
3001         skb_dst_set(skb, &rt->dst);
3002         if (rtm->rtm_flags & RTM_F_NOTIFY)
3003                 rt->rt_flags |= RTCF_NOTIFY;
3004
3005         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3006                            RTM_NEWROUTE, 0, 0);
3007         if (err <= 0)
3008                 goto errout_free;
3009
3010         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3011 errout:
3012         return err;
3013
3014 errout_free:
3015         kfree_skb(skb);
3016         goto errout;
3017 }
3018
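/*
 * ip_rt_dump() walks the route cache hash table under rcu_read_lock_bh()
 * and emits one RTM_NEWROUTE message per live, non-expired entry for a
 * netlink dump.  cb->args[0] and cb->args[1] remember the hash bucket and
 * chain index so a dump that fills its skb can resume where it stopped.
 */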
3019 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3020 {
3021         struct rtable *rt;
3022         int h, s_h;
3023         int idx, s_idx;
3024         struct net *net;
3025
3026         net = sock_net(skb->sk);
3027
3028         s_h = cb->args[0];
3029         if (s_h < 0)
3030                 s_h = 0;
3031         s_idx = idx = cb->args[1];
3032         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3033                 if (!rt_hash_table[h].chain)
3034                         continue;
3035                 rcu_read_lock_bh();
3036                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3037                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3038                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3039                                 continue;
3040                         if (rt_is_expired(rt))
3041                                 continue;
3042                         skb_dst_set_noref(skb, &rt->dst);
3043                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3044                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3045                                          1, NLM_F_MULTI) <= 0) {
3046                                 skb_dst_drop(skb);
3047                                 rcu_read_unlock_bh();
3048                                 goto done;
3049                         }
3050                         skb_dst_drop(skb);
3051                 }
3052                 rcu_read_unlock_bh();
3053         }
3054
3055 done:
3056         cb->args[0] = h;
3057         cb->args[1] = idx;
3058         return skb->len;
3059 }
3060
3061 void ip_rt_multicast_event(struct in_device *in_dev)
3062 {
3063         rt_cache_flush(dev_net(in_dev->dev), 0);
3064 }
3065
3066 #ifdef CONFIG_SYSCTL
3067 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3068                                         void __user *buffer,
3069                                         size_t *lenp, loff_t *ppos)
3070 {
3071         if (write) {
3072                 int flush_delay;
3073                 ctl_table ctl;
3074                 struct net *net;
3075
3076                 memcpy(&ctl, __ctl, sizeof(ctl));
3077                 ctl.data = &flush_delay;
3078                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3079
3080                 net = (struct net *)__ctl->extra1;
3081                 rt_cache_flush(net, flush_delay);
3082                 return 0;
3083         }
3084
3085         return -EINVAL;
3086 }
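/*
 * The handler above backs the write-only /proc/sys/net/ipv4/route/flush
 * file (registered per namespace via ipv4_route_flush_table below); the
 * integer written is handed to rt_cache_flush() as the flush delay.
 * Illustrative use from userspace:
 *
 *      # flush the route cache immediately
 *      echo 0 > /proc/sys/net/ipv4/route/flush
 */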
3087
3088 static ctl_table ipv4_route_table[] = {
3089         {
3090                 .procname       = "gc_thresh",
3091                 .data           = &ipv4_dst_ops.gc_thresh,
3092                 .maxlen         = sizeof(int),
3093                 .mode           = 0644,
3094                 .proc_handler   = proc_dointvec,
3095         },
3096         {
3097                 .procname       = "max_size",
3098                 .data           = &ip_rt_max_size,
3099                 .maxlen         = sizeof(int),
3100                 .mode           = 0644,
3101                 .proc_handler   = proc_dointvec,
3102         },
3103         {
3104                 /* Deprecated. Use gc_min_interval_ms */
3105
3106                 .procname       = "gc_min_interval",
3107                 .data           = &ip_rt_gc_min_interval,
3108                 .maxlen         = sizeof(int),
3109                 .mode           = 0644,
3110                 .proc_handler   = proc_dointvec_jiffies,
3111         },
3112         {
3113                 .procname       = "gc_min_interval_ms",
3114                 .data           = &ip_rt_gc_min_interval,
3115                 .maxlen         = sizeof(int),
3116                 .mode           = 0644,
3117                 .proc_handler   = proc_dointvec_ms_jiffies,
3118         },
3119         {
3120                 .procname       = "gc_timeout",
3121                 .data           = &ip_rt_gc_timeout,
3122                 .maxlen         = sizeof(int),
3123                 .mode           = 0644,
3124                 .proc_handler   = proc_dointvec_jiffies,
3125         },
3126         {
3127                 .procname       = "gc_interval",
3128                 .data           = &ip_rt_gc_interval,
3129                 .maxlen         = sizeof(int),
3130                 .mode           = 0644,
3131                 .proc_handler   = proc_dointvec_jiffies,
3132         },
3133         {
3134                 .procname       = "redirect_load",
3135                 .data           = &ip_rt_redirect_load,
3136                 .maxlen         = sizeof(int),
3137                 .mode           = 0644,
3138                 .proc_handler   = proc_dointvec,
3139         },
3140         {
3141                 .procname       = "redirect_number",
3142                 .data           = &ip_rt_redirect_number,
3143                 .maxlen         = sizeof(int),
3144                 .mode           = 0644,
3145                 .proc_handler   = proc_dointvec,
3146         },
3147         {
3148                 .procname       = "redirect_silence",
3149                 .data           = &ip_rt_redirect_silence,
3150                 .maxlen         = sizeof(int),
3151                 .mode           = 0644,
3152                 .proc_handler   = proc_dointvec,
3153         },
3154         {
3155                 .procname       = "error_cost",
3156                 .data           = &ip_rt_error_cost,
3157                 .maxlen         = sizeof(int),
3158                 .mode           = 0644,
3159                 .proc_handler   = proc_dointvec,
3160         },
3161         {
3162                 .procname       = "error_burst",
3163                 .data           = &ip_rt_error_burst,
3164                 .maxlen         = sizeof(int),
3165                 .mode           = 0644,
3166                 .proc_handler   = proc_dointvec,
3167         },
3168         {
3169                 .procname       = "gc_elasticity",
3170                 .data           = &ip_rt_gc_elasticity,
3171                 .maxlen         = sizeof(int),
3172                 .mode           = 0644,
3173                 .proc_handler   = proc_dointvec,
3174         },
3175         {
3176                 .procname       = "mtu_expires",
3177                 .data           = &ip_rt_mtu_expires,
3178                 .maxlen         = sizeof(int),
3179                 .mode           = 0644,
3180                 .proc_handler   = proc_dointvec_jiffies,
3181         },
3182         {
3183                 .procname       = "min_pmtu",
3184                 .data           = &ip_rt_min_pmtu,
3185                 .maxlen         = sizeof(int),
3186                 .mode           = 0644,
3187                 .proc_handler   = proc_dointvec,
3188         },
3189         {
3190                 .procname       = "min_adv_mss",
3191                 .data           = &ip_rt_min_advmss,
3192                 .maxlen         = sizeof(int),
3193                 .mode           = 0644,
3194                 .proc_handler   = proc_dointvec,
3195         },
3196         { }
3197 };
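/*
 * These knobs appear under /proc/sys/net/ipv4/route/ once
 * ip_static_sysctl_init() registers ipv4_skeleton below.  Illustrative
 * userspace access:
 *
 *      sysctl net.ipv4.route.min_pmtu
 *      sysctl -w net.ipv4.route.mtu_expires=300
 */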
3198
3199 static struct ctl_table empty[1];
3200
3201 static struct ctl_table ipv4_skeleton[] =
3202 {
3203         { .procname = "route",
3204           .mode = 0555, .child = ipv4_route_table},
3205         { .procname = "neigh",
3206           .mode = 0555, .child = empty},
3207         { }
3208 };
3209
3210 static __net_initdata struct ctl_path ipv4_path[] = {
3211         { .procname = "net", },
3212         { .procname = "ipv4", },
3213         { },
3214 };
3215
3216 static struct ctl_table ipv4_route_flush_table[] = {
3217         {
3218                 .procname       = "flush",
3219                 .maxlen         = sizeof(int),
3220                 .mode           = 0200,
3221                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3222         },
3223         { },
3224 };
3225
3226 static __net_initdata struct ctl_path ipv4_route_path[] = {
3227         { .procname = "net", },
3228         { .procname = "ipv4", },
3229         { .procname = "route", },
3230         { },
3231 };
3232
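/*
 * Per-namespace registration of the flush sysctl: namespaces other than
 * init_net get their own copy of ipv4_route_flush_table so that extra1
 * can point at the owning struct net, which ipv4_sysctl_rtcache_flush()
 * reads back to flush the right namespace's cache.
 */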
3233 static __net_init int sysctl_route_net_init(struct net *net)
3234 {
3235         struct ctl_table *tbl;
3236
3237         tbl = ipv4_route_flush_table;
3238         if (!net_eq(net, &init_net)) {
3239                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3240                 if (tbl == NULL)
3241                         goto err_dup;
3242         }
3243         tbl[0].extra1 = net;
3244
3245         net->ipv4.route_hdr =
3246                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3247         if (net->ipv4.route_hdr == NULL)
3248                 goto err_reg;
3249         return 0;
3250
3251 err_reg:
3252         if (tbl != ipv4_route_flush_table)
3253                 kfree(tbl);
3254 err_dup:
3255         return -ENOMEM;
3256 }
3257
3258 static __net_exit void sysctl_route_net_exit(struct net *net)
3259 {
3260         struct ctl_table *tbl;
3261
3262         tbl = net->ipv4.route_hdr->ctl_table_arg;
3263         unregister_net_sysctl_table(net->ipv4.route_hdr);
3264         BUG_ON(tbl == ipv4_route_flush_table);
3265         kfree(tbl);
3266 }
3267
3268 static __net_initdata struct pernet_operations sysctl_route_ops = {
3269         .init = sysctl_route_net_init,
3270         .exit = sysctl_route_net_exit,
3271 };
3272 #endif
3273
3274 static __net_init int rt_genid_init(struct net *net)
3275 {
3276         get_random_bytes(&net->ipv4.rt_genid,
3277                          sizeof(net->ipv4.rt_genid));
3278         return 0;
3279 }
3280
3281 static __net_initdata struct pernet_operations rt_genid_ops = {
3282         .init = rt_genid_init,
3283 };
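/*
 * Each namespace gets a random rt_genid at creation; cached routes record
 * the generation they were created under and rt_is_expired() treats any
 * mismatch as stale, so bumping the generation is a cheap way to
 * invalidate a whole namespace's route cache.
 */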
3284
3286 #ifdef CONFIG_IP_ROUTE_CLASSID
3287 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3288 #endif /* CONFIG_IP_ROUTE_CLASSID */
3289
3290 static __initdata unsigned long rhash_entries;
3291 static int __init set_rhash_entries(char *str)
3292 {
3293         if (!str)
3294                 return 0;
3295         rhash_entries = simple_strtoul(str, &str, 0);
3296         return 1;
3297 }
3298 __setup("rhash_entries=", set_rhash_entries);
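/*
 * The route cache hash size can be forced from the kernel command line;
 * the value is passed straight to alloc_large_system_hash() in
 * ip_rt_init() below.  Illustrative boot parameter:
 *
 *      rhash_entries=262144
 */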
3299
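/*
 * ip_rt_init() is the boot-time entry point for this file: it creates the
 * dst slab cache, allocates the route cache hash table and derives
 * gc_thresh and ip_rt_max_size from its size, initialises devinet and the
 * FIB, sets up /proc, xfrm and (with CONFIG_SYSCTL) the per-net sysctls,
 * and registers inet_rtm_getroute() for RTM_GETROUTE.
 */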
3300 int __init ip_rt_init(void)
3301 {
3302         int rc = 0;
3303
3304 #ifdef CONFIG_IP_ROUTE_CLASSID
3305         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3306         if (!ip_rt_acct)
3307                 panic("IP: failed to allocate ip_rt_acct\n");
3308 #endif
3309
3310         ipv4_dst_ops.kmem_cachep =
3311                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3312                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3313
3314         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3315
3316         if (dst_entries_init(&ipv4_dst_ops) < 0)
3317                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3318
3319         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3320                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3321
3322         rt_hash_table = (struct rt_hash_bucket *)
3323                 alloc_large_system_hash("IP route cache",
3324                                         sizeof(struct rt_hash_bucket),
3325                                         rhash_entries,
3326                                         (totalram_pages >= 128 * 1024) ?
3327                                         15 : 17,
3328                                         0,
3329                                         &rt_hash_log,
3330                                         &rt_hash_mask,
3331                                         rhash_entries ? 0 : 512 * 1024);
3332         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3333         rt_hash_lock_init();
3334
3335         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3336         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3337
3338         devinet_init();
3339         ip_fib_init();
3340
3341         if (ip_rt_proc_init())
3342                 printk(KERN_ERR "Unable to create route proc files\n");
3343 #ifdef CONFIG_XFRM
3344         xfrm_init();
3345         xfrm4_init(ip_rt_max_size);
3346 #endif
3347         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3348
3349 #ifdef CONFIG_SYSCTL
3350         register_pernet_subsys(&sysctl_route_ops);
3351 #endif
3352         register_pernet_subsys(&rt_genid_ops);
3353         return rc;
3354 }
3355
3356 #ifdef CONFIG_SYSCTL
3357 /*
3358  * We really need to sanitize the damn ipv4 init order; then all
3359  * this nonsense will go away.
3360  */
3361 void __init ip_static_sysctl_init(void)
3362 {
3363         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3364 }
3365 #endif