/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#include <linux/sysctl.h>
#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};
#define ECN_OR_COST(class)	TC_PRIO_##class

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUs.
 */
#if NR_CPUS >= 32
#define RT_HASH_LOCK_SZ	4096
#elif NR_CPUS >= 16
#define RT_HASH_LOCK_SZ	2048
#elif NR_CPUS >= 8
#define RT_HASH_LOCK_SZ	1024
#elif NR_CPUS >= 4
#define RT_HASH_LOCK_SZ	512
#else
#define RT_HASH_LOCK_SZ	256
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
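/*
 * Illustrative sketch (not part of the original file): the write-side
 * discipline the locking scheme above implies. A writer that unlinks an
 * entry from bucket `slot' serializes on the hashed spinlock for that slot,
 * while lookups walk the chain under rcu_read_lock_bh() only, so the entry
 * must be freed through an RCU grace period. `example_unlink' is a
 * hypothetical helper mirroring rt_del() further down.
 */
#if 0
static void example_unlink(unsigned slot, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(slot));	/* one lock covers many buckets */
	for (rthp = &rt_hash_table[slot].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next) {
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;	/* unlink; lockless readers OK */
			rt_free(rt);		/* RCU-deferred free */
			break;
		}
	}
	spin_unlock_bh(rt_hash_lock_addr(slot));
}
#endif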
static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;

struct rt_cache_stat *rt_cache_stat;

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
		& rt_hash_mask);
}
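/*
 * Illustrative note (not part of the original file): every user of the cache
 * derives both the bucket and the write lock from the same flow key, e.g.
 *
 *	unsigned hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 *	struct rtable *chain = rt_hash_table[hash].chain;
 *	spinlock_t *lock = rt_hash_lock_addr(hash);	(NULL on UP builds)
 *
 * Mixing the per-boot secret rt_hash_rnd into the jhash keeps remote senders
 * from deliberately colliding many entries into one chain.
 */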
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = rcu_dereference(seq->private);

	r = r->u.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	while (pos && (r = rt_cache_get_next(seq, r)))
		--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}
static struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;
	rc = seq_open(file, &rt_cache_seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
	memset(s, 0, sizeof(*s));
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}
static struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search);
	return 0;
}
static struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */
static __inline__ void rt_free(struct rtable *rt)
{
	multipath_remove(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	multipath_remove(rt);
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
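/*
 * Illustrative note (not part of the original file): because the two top
 * bits mark "valuable" and "not quite useless" entries while the inverted
 * age fills the low bits, a single unsigned comparison orders eviction
 * candidates: an entry with bit 31 set outranks any entry without it, and
 * among equals the one idle longest (smallest inverted age) scores lowest.
 * rt_intern_hash() below keeps the minimum-score entry as its victim `cand'.
 */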
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
	       fl1->oif == fl2->oif &&
	       fl1->iif == fl2->iif;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
						struct rtable *expentry,
						int *removed_count)
{
	int passedexpired = 0;
	struct rtable **nextstep = NULL;
	struct rtable **rthp = chain_head;
	struct rtable *rth;

	if (removed_count)
		*removed_count = 0;

	while ((rth = *rthp) != NULL) {
		if (rth == expentry)
			passedexpired = 1;

		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
			if (*rthp == expentry) {
				*rthp = rth->u.rt_next;
				continue;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
				if (removed_count)
					++(*removed_count);
			}
		} else {
			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
			    passedexpired && !nextstep)
				nextstep = &rth->u.rt_next;

			rthp = &rth->u.rt_next;
		}
	}

	rt_free(expentry);
	if (removed_count)
		++(*removed_count);

	return nextstep;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	static int rover;
	int i = rover, t;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
	     t -= ip_rt_gc_timeout) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		spin_lock(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
			/* remove all related balanced entries if necessary */
			if (rth->u.dst.flags & DST_BALANCED) {
				rthp = rt_remove_balanced_route(
					&rt_hash_table[i].chain,
					rth, NULL);
				if (!rthp)
					break;
			} else {
				*rthp = rth->u.rt_next;
				rt_free(rth);
			}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			*rthp = rth->u.rt_next;
			rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
		}
		spin_unlock(rt_hash_lock_addr(i));

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rt_free(rth);
		}
	}
}
static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	/* flush existing multipath state */
	multipath_flush();

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline is not yet reached, prolong the timer to
		   "delay", otherwise fire it at the deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}
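/*
 * Worked example (not part of the original file, assuming HZ=1000): a
 * user-mode rt_cache_flush(-1) uses ip_rt_min_delay = 2s, arms the timer 2s
 * out and sets rt_deadline = now + ip_rt_max_delay (10s). Repeated user-mode
 * requests keep re-arming 2s timers only while the deadline is still more
 * than 8s (max - min) away; after that tmo drops to 0 and the flush runs
 * immediately, so a steady stream of requests cannot postpone the flush
 * past the deadline.
 */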
static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit the cache size.
 */
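/*
 * Illustrative sketch (not part of the original file): the feedback loop in
 * rt_garbage_collect() below, reduced to its control variable. Each pass
 * that misses its goal halves "expire" (more aggressive eviction); each
 * pass that meets it relaxes again, clamped to ip_rt_gc_timeout:
 *
 *	miss:	expire >>= 1;
 *	done:	expire += ip_rt_gc_min_interval;
 *		if (expire > ip_rt_gc_timeout || entries < gc_thresh)
 *			expire = ip_rt_gc_timeout;
 */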
static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
				/* remove all related balanced entries
				 * if necessary
				 */
				if (rth->u.dst.flags & DST_BALANCED) {
					int r;

					rthp = rt_remove_balanced_route(
						&rt_hash_table[k].chain,
						rth,
						&r);
					goal -= r;
					if (!rthp)
						break;
				} else {
					*rthp = rth->u.rt_next;
					rt_free(rth);
					goal--;
				}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
				*rthp = rth->u.rt_next;
				rt_free(rth);
				goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
		if (!(rth->u.dst.flags & DST_BALANCED) &&
		    compare_keys(&rth->fl, &rt->fl)) {
#else
		if (compare_keys(&rth->fl, &rt->fl)) {
#endif
			/* Put it first */
			*rthp = rth->u.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain
		 * length; when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}
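/*
 * Illustrative note (not part of the original file): the two
 * rcu_assign_pointer() calls above publish the move-to-front in an order
 * that is safe for lockless readers. The moved entry's forward pointer is
 * set to the current chain head before the head itself is switched, so a
 * concurrent reader always sees a complete chain, at worst traversing the
 * moved entry twice.
 */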
void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that stays unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If the peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
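/*
 * Illustrative usage (not part of the original file): a caller that has just
 * built an IPv4 header for a routed skb would pick its ID via
 *
 *	__ip_select_ident(iph, &rt->u.dst, 0);
 *
 * which prefers the long-lived per-destination peer counter and only falls
 * back to the hashed global generator above when no peer could be allocated.
 */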
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.fl4_tos != tos ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
			"tos %02x\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
#endif
	in_dev_put(in_dev);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
						     rt->fl.fl4_src ^
							(rt->fl.oif << 5),
						     rt->fl.fl4_tos);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot about the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
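/*
 * Worked example (not part of the original file, assuming HZ=1000): with
 * ip_rt_redirect_load = HZ/50 and ip_rt_redirect_number = 9, the gap before
 * each successive redirect doubles: 20ms, 40ms, ..., 5.12s, i.e. nine
 * redirects in roughly 10s total, after which we go silent. Once no
 * redirect-worthy packet has been seen for ip_rt_redirect_silence =
 * (HZ/50) << 10, about 20s, the token count resets and the cycle restarts.
 */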
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
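/*
 * Worked example (not part of the original file, assuming HZ=1000): tokens
 * accrue at one per jiffie up to the ip_rt_error_burst = 5*HZ cap, and each
 * ICMP error costs ip_rt_error_cost = HZ tokens, so a flood of unroutable
 * packets is answered with an initial burst of at most 5 ICMP errors and
 * then roughly one per second.
 */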
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
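/*
 * Worked example (not part of the original file): guess_mtu(1500) returns
 * 1492, the next plateau strictly below the old MTU, so a router that sends
 * "fragmentation needed" without a next-hop MTU still drives the path MTU
 * down the RFC 1191 plateau ladder one step per probe.
 */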
unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	u32  skeys[2] = { iph->saddr, 0, };
	u32  daddr = iph->daddr;
	u8   tos = iph->tos & IPTOS_RT_MASK;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash_code(daddr, skeys[i], tos);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst  == daddr &&
			    rth->rt_src  == iph->saddr &&
			    rth->fl.fl4_tos == tos &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
	}
}
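/*
 * Illustrative note (not part of the original file): the clamp above means
 * a forged "frag needed" advertising e.g. mtu = 68 lowers the cached path
 * MTU only to ip_rt_min_pmtu (512 + 20 + 20 = 552 by default) and
 * additionally locks RTAX_MTU, so this entry's PMTU cannot be lowered any
 * further afterwards.
 */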
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != &loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned.
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	u32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	u32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     u32 daddr,
				     u32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len) {
			int i;
			unsigned char *p = skb->mac.raw;
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  u32 daddr, u32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable* rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	u32 spec_dst, itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);
		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
static inline int ip_mkroute_input_def(struct sk_buff *skb,
				       struct fib_result* res,
				       const struct flowi *fl,
				       struct in_device *in_dev,
				       u32 daddr, u32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;
	atomic_set(&rth->u.dst.__refcnt, 1);

	/* put it into the cache */
	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth = NULL;
	unsigned char hop, hopcount, lasthop;
	int err = -EINVAL;
	unsigned int hash;

	if (res->fi)
		hopcount = res->fi->fib_nhs;
	else
		hopcount = 1;

	lasthop = hopcount - 1;

	/* distinguish between multipath and singlepath */
	if (hopcount < 2)
		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
					    saddr, tos);

	/* add all alternatives to the routing cache */
	for (hop = 0; hop < hopcount; hop++) {
		res->nh_sel = hop;

		/* create a routing cache entry */
		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
				      &rth);
		if (err)
			return err;

		/* put it into the cache */
		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
		err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
		if (err)
			return err;

		/* forward hop information to multipath impl. */
		multipath_set_nhinfo(rth,
				     FIB_RES_NETWORK(*res),
				     FIB_RES_NETMASK(*res),
				     res->prefixlen,
				     &FIB_RES_NH(*res));

		/* only for the last hop the reference count is handled
		 * outside
		 */
		if (hop == lasthop)
			atomic_set(&(skb->dst->__refcnt), 1);
	}
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = skb->nfmark
#endif
				      } },
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	u32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= skb->nfmark;
#endif
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == skb->nfmark &&
#endif
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, e.g. for
	   SDR messages from all over the world. Now we try to get rid of
	   them. Really, provided the software IP multicast filter is
	   organized reasonably (at least, hashed), it does not result in a
	   slowdown compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				skb->nh.iph->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result* res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == 0xFFFFFFFF)
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use the
		   default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi) {
		rth->rt_multipath_alg = res->fi->fib_mp_alg;
		if (res->fi->fib_nhs > 1)
			rth->u.dst.flags |= DST_BALANCED;
	}
#endif
	if (in_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
#ifdef CONFIG_IP_ROUTE_FWMARK
	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
#endif
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);
	return err;
}
static inline int ip_mkroute_output_def(struct rtable **rp,
					struct fib_result* res,
					const struct flowi *fl,
					const struct flowi *oldflp,
					struct net_device *dev_out,
					unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		u32 tos = RT_FL_TOS(oldflp);

		atomic_set(&rth->u.dst.__refcnt, 1);

		hash = rt_hash_code(oldflp->fl4_dst,
				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	u32 tos = RT_FL_TOS(oldflp);
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash_code(oldflp->fl4_dst,
					    oldflp->fl4_src ^
					    (oldflp->oif << 5), tos);
			err = rt_intern_hash(hash, rth, rp);

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		atomic_set(&(*rp)->u.dst.__refcnt, 1);
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = oldflp->fl4_fwmark
#endif
				      } },
			    .iif = loopback_dev.ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;


	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid). Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;
		if (__in_dev_get(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
#ifdef CONFIG_IP_ROUTE_FWMARK
		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
#endif
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK))) {

			/* check for multipath routes and choose one if
			 * necessary
			 */
			if (multipath_select_route(flp, rth, rp)) {
				dst_hold(&(*rp)->u.dst);
				RT_CACHE_STAT_INC(out_hit);
				rcu_read_unlock_bh();
				return 0;
			}

			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(rp, flp);
}
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
	}

	return 0;
}

int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(rp, flp, NULL, 0);
}
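/*
 * Illustrative usage (not part of the original file): a typical in-kernel
 * caller resolves an output route by key only and releases it with
 * ip_rt_put(); `some_daddr' is a hypothetical destination address:
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = some_daddr } } };
 *	struct rtable *rt;
 *	if (ip_route_output_key(&rt, &fl) == 0) {
 *		... use rt->u.dst ...
 *		ip_rt_put(rt);
 *	}
 */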
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned char *b = skb->tail;
	struct rta_cacheinfo ci;
#ifdef CONFIG_IP_MROUTE
	struct rtattr *eptr;
#endif

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
	r = NLMSG_DATA(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol	= RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
		__u32 alg = rt->rt_multipath_alg;

		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
	}
#endif
	if (rt->fl.iif)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
	if (rt->rt_dst != rt->rt_gateway)
		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto rtattr_failure;
	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	ci.rta_used	= rt->u.dst.__use;
	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
	if (rt->u.dst.expires)
		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_error	= rt->u.dst.error;
	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
	if (rt->peer) {
		ci.rta_id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ci.rta_ts = rt->peer->tcp_ts;
			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
		}
	}
#ifdef CONFIG_IP_MROUTE
	eptr = (struct rtattr*)skb->tail;
#endif
	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		u32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    ipv4_devconf.mc_forwarding) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nlmsg_failure;
				}
				if (err == -EMSGSIZE)
					goto nlmsg_failure;
				((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
			}
		} else
#endif
			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
	}

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
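
/*
 * Note on the conventions above: NLMSG_NEW() and RTA_PUT() jump to
 * nlmsg_failure/rtattr_failure when the skb runs out of tailroom; the
 * failure path trims the skb back to where this message began, so a
 * partially built message is discarded as a whole. On success the
 * function returns skb->len, on overflow -1.
 */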
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct rtattr **rta = arg;
	struct rtmsg *rtm = NLMSG_DATA(nlh);
	struct rtable *rt = NULL;
	u32 dst = 0;
	u32 src = 0;
	int iif = 0;
	int err;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb->mac.raw = skb->data;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	if (rta[RTA_SRC - 1])
		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
	if (rta[RTA_DST - 1])
		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
	if (rta[RTA_IIF - 1])
		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));

	if (iif) {
		struct net_device *dev = __dev_get_by_index(iif);
		err = -ENODEV;
		if (!dev)
			goto out_free;
		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();
		rt = (struct rtable*)skb->dst;
		if (!err && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
							 .saddr = src,
							 .tos = rtm->rtm_tos } } };
		int oif = 0;
		if (rta[RTA_OIF - 1])
			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
		fl.oif = oif;
		err = ip_route_output_key(&rt, &fl);
	}
	if (err)
		goto out_free;

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;
	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err == 0)
		goto out_free;
	if (err < 0) {
		err = -EMSGSIZE;
		goto out_free;
	}

	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
	if (err > 0)
		err = 0;
	return err;

out_free:
	kfree_skb(skb);
	return err;
}
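
/*
 * inet_rtm_getroute() services RTM_GETROUTE requests (what "ip route get"
 * issues from userspace): it fakes up an skb, pushes it through the same
 * input or output resolution path as real traffic, and unicasts the
 * rt_fill_info() result back to the requesting socket.
 */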
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h) continue;
		if (h > s_h)
			s_idx = 0;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.rt_next), idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
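
/*
 * Note: ip_rt_dump() is restartable. The bucket index and the position
 * within the chain are saved in cb->args[0]/cb->args[1], so a dump that
 * fills the skb resumes where it left off on the next netlink callback.
 */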
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp, void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}
static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
						int __user *name,
						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen,
						void **context)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}
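
/*
 * Example (editor's note, not in the original): both handlers above expose
 * the same operation, flushing the route cache; the first serves writes to
 * the proc file, the second the sysctl(2) binary interface. From a shell:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * The integer written is handed to rt_cache_flush() as the flush delay.
 */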
ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	/* Deprecated. Use gc_min_interval_ms */
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif /* CONFIG_SYSCTL */
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks. But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
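
/*
 * Layout note: ip_rt_acct is laid out as 256 accounting slots (one per
 * routing realm / tclassid value) per CPU, which is what the "* 256" in
 * IP_RT_ACCT_CPU() indexes over. The proc reader below folds the per-CPU
 * tables into a single result for userspace.
 */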
#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}
	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
		u32 *dst = (u32 *) buffer;

		/* Copy first cpu. */
		*start = buffer;
		memcpy(dst, src, length);

		/* Add the other cpus in, one int at a time. */
		for (i = 1; i < NR_CPUS; i++) {
			unsigned int j;

			if (!cpu_possible(i))
				continue;
			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
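
/*
 * Example (editor's note, not in the original): the parser above implements
 * a boot-time override for the route-cache hash size, e.g.
 *
 *	rhash_entries=65536
 *
 * on the kernel command line. If it is not given, rhash_entries stays 0 and
 * alloc_large_system_hash() in ip_rt_init() sizes the table from available
 * memory instead.
 */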
int __init ip_rt_init(void)
{
	int rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	{
	int order;
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
	}
#endif
	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
						     sizeof(struct rtable),
						     0, SLAB_HWCACHE_ALIGN,
						     NULL, NULL);
	if (!ipv4_dst_ops.kmem_cachep)
		panic("IP: failed to allocate ip_dst_cache\n");
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
						15 : 17,
					HASH_HIGHMEM,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();
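
	/*
	 * Derive GC tuning from the hash size chosen above: start garbage
	 * collection once the cache holds about one entry per bucket, and
	 * cap the cache at an average of 16 entries per bucket.
	 */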
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;
	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
	if (!rt_cache_stat)
		return -ENOMEM;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_periodic_timer);
	rt_periodic_timer.function = rt_check_expire;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;
	/* All the timers, started at system boot, tend to synchronize.
	   Perturb them a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
					ip_rt_secret_interval;
	add_timer(&rt_secret_timer);
#ifdef CONFIG_PROC_FS
	{
	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
					     proc_net_stat))) {
		free_percpu(rt_cache_stat);
		return -ENOMEM;
	}
	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	return rc;
}
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);