net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60  *
  61  *              This program is free software; you can redistribute it and/or
  62  *              modify it under the terms of the GNU General Public License
  63  *              as published by the Free Software Foundation; either version
  64  *              2 of the License, or (at your option) any later version.
  65  */
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <asm/system.h>
  70 #include <linux/bitops.h>
  71 #include <linux/types.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/mm.h>
  75 #include <linux/bootmem.h>
  76 #include <linux/string.h>
  77 #include <linux/socket.h>
  78 #include <linux/sockios.h>
  79 #include <linux/errno.h>
  80 #include <linux/in.h>
  81 #include <linux/inet.h>
  82 #include <linux/netdevice.h>
  83 #include <linux/proc_fs.h>
  84 #include <linux/init.h>
  85 #include <linux/skbuff.h>
  86 #include <linux/rtnetlink.h>
  87 #include <linux/inetdevice.h>
  88 #include <linux/igmp.h>
  89 #include <linux/pkt_sched.h>
  90 #include <linux/mroute.h>
  91 #include <linux/netfilter_ipv4.h>
  92 #include <linux/random.h>
  93 #include <linux/jhash.h>
  94 #include <linux/rcupdate.h>
  95 #include <linux/times.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/ip_mp_alg.h>
 107 #include <net/netevent.h>
 108 #ifdef CONFIG_SYSCTL
 109 #include <linux/sysctl.h>
 110 #endif
 111
 112 #define RT_FL_TOS(oldflp) \
 113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 114
 115 #define IP_MAX_MTU      0xFFF0
 116
 117 #define RT_GC_TIMEOUT (300*HZ)
 118
 119 static int ip_rt_min_delay              = 2 * HZ;
 120 static int ip_rt_max_delay              = 10 * HZ;
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval            = 60 * HZ;
 124 static int ip_rt_gc_min_interval        = HZ / 2;
 125 static int ip_rt_redirect_number        = 9;
 126 static int ip_rt_redirect_load          = HZ / 50;
 127 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost             = HZ;
 129 static int ip_rt_error_burst            = 5 * HZ;
 130 static int ip_rt_gc_elasticity          = 8;
 131 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 133 static int ip_rt_min_advmss             = 256;
 134 static int ip_rt_secret_interval        = 10 * 60 * HZ;
 135 static unsigned long rt_deadline;
 136
 137 #define RTprint(a...)   printk(KERN_DEBUG a)
 138
 139 static struct timer_list rt_flush_timer;
 140 static struct timer_list rt_periodic_timer;
 141 static struct timer_list rt_secret_timer;
 142
 143 /*
 144  *      Interface to generic destination cache.
 145  */
 146
 147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 148 static void              ipv4_dst_destroy(struct dst_entry *dst);
 149 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 150                                          struct net_device *dev, int how);
 151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 152 static void              ipv4_link_failure(struct sk_buff *skb);
 153 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 154 static int rt_garbage_collect(void);
 155
 156
 157 static struct dst_ops ipv4_dst_ops = {
 158         .family =               AF_INET,
 159         .protocol =             __constant_htons(ETH_P_IP),
 160         .gc =                   rt_garbage_collect,
 161         .check =                ipv4_dst_check,
 162         .destroy =              ipv4_dst_destroy,
 163         .ifdown =               ipv4_dst_ifdown,
 164         .negative_advice =      ipv4_negative_advice,
 165         .link_failure =         ipv4_link_failure,
 166         .update_pmtu =          ip_rt_update_pmtu,
 167         .entry_size =           sizeof(struct rtable),
 168 };
 169
 170 #define ECN_OR_COST(class)      TC_PRIO_##class
 171
 172 __u8 ip_tos2prio[16] = {
 173         TC_PRIO_BESTEFFORT,
 174         ECN_OR_COST(FILLER),
 175         TC_PRIO_BESTEFFORT,
 176         ECN_OR_COST(BESTEFFORT),
 177         TC_PRIO_BULK,
 178         ECN_OR_COST(BULK),
 179         TC_PRIO_BULK,
 180         ECN_OR_COST(BULK),
 181         TC_PRIO_INTERACTIVE,
 182         ECN_OR_COST(INTERACTIVE),
 183         TC_PRIO_INTERACTIVE,
 184         ECN_OR_COST(INTERACTIVE),
 185         TC_PRIO_INTERACTIVE_BULK,
 186         ECN_OR_COST(INTERACTIVE_BULK),
 187         TC_PRIO_INTERACTIVE_BULK,
 188         ECN_OR_COST(INTERACTIVE_BULK)
 189 };
 190
 191
 192 /*
 193  * Route cache.
 194  */
 195
  196 /* The locking scheme is rather straightforward:
 197  *
 198  * 1) Read-Copy Update protects the buckets of the central route hash.
 199  * 2) Only writers remove entries, and they hold the lock
 200  *    as they look at rtable reference counts.
 201  * 3) Only readers acquire references to rtable entries,
 202  *    they do so with atomic increments and with the
 203  *    lock held.
 204  */
 205
 206 struct rt_hash_bucket {
 207         struct rtable   *chain;
 208 };
 209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 210         defined(CONFIG_PROVE_LOCKING)
 211 /*
 212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 213  * The size of this table is a power of two and depends on the number of CPUS.
 214  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 215  */
 216 #ifdef CONFIG_LOCKDEP
 217 # define RT_HASH_LOCK_SZ        256
 218 #else
 219 # if NR_CPUS >= 32
 220 #  define RT_HASH_LOCK_SZ       4096
 221 # elif NR_CPUS >= 16
 222 #  define RT_HASH_LOCK_SZ       2048
 223 # elif NR_CPUS >= 8
 224 #  define RT_HASH_LOCK_SZ       1024
 225 # elif NR_CPUS >= 4
 226 #  define RT_HASH_LOCK_SZ       512
 227 # else
 228 #  define RT_HASH_LOCK_SZ       256
 229 # endif
 230 #endif
 231
 232 static spinlock_t       *rt_hash_locks;
 233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 234 # define rt_hash_lock_init()    { \
 235                 int i; \
 236                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
 237                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
 238                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
 239                         spin_lock_init(&rt_hash_locks[i]); \
 240                 }
 241 #else
 242 # define rt_hash_lock_addr(slot) NULL
 243 # define rt_hash_lock_init()
 244 #endif
 245
 246 static struct rt_hash_bucket    *rt_hash_table;
 247 static unsigned                 rt_hash_mask;
 248 static int                      rt_hash_log;
 249 static unsigned int             rt_hash_rnd;
 250
 251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 252 #define RT_CACHE_STAT_INC(field) \
 253         (__raw_get_cpu_var(rt_cache_stat).field++)
 254
 255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 256                                 struct rtable **res);
 257
 258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 259 {
 260         return (jhash_2words(daddr, saddr, rt_hash_rnd)
 261                 & rt_hash_mask);
 262 }
 263
 264 #define rt_hash(daddr, saddr, idx) \
 265         rt_hash_code((__force u32)(__be32)(daddr),\
 266                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
 267
 268 #ifdef CONFIG_PROC_FS
 269 struct rt_cache_iter_state {
 270         int bucket;
 271 };
 272
 273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 274 {
 275         struct rtable *r = NULL;
 276         struct rt_cache_iter_state *st = seq->private;
 277
 278         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 279                 rcu_read_lock_bh();
 280                 r = rt_hash_table[st->bucket].chain;
 281                 if (r)
 282                         break;
 283                 rcu_read_unlock_bh();
 284         }
 285         return r;
 286 }
 287
 288 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 289 {
 290         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 291
 292         r = r->u.rt_next;
 293         while (!r) {
 294                 rcu_read_unlock_bh();
 295                 if (--st->bucket < 0)
 296                         break;
 297                 rcu_read_lock_bh();
 298                 r = rt_hash_table[st->bucket].chain;
 299         }
 300         return r;
 301 }
 302
 303 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 304 {
 305         struct rtable *r = rt_cache_get_first(seq);
 306
 307         if (r)
 308                 while (pos && (r = rt_cache_get_next(seq, r)))
 309                         --pos;
 310         return pos ? NULL : r;
 311 }
 312
 313 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 314 {
 315         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 316 }
 317
 318 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 319 {
 320         struct rtable *r = NULL;
 321
 322         if (v == SEQ_START_TOKEN)
 323                 r = rt_cache_get_first(seq);
 324         else
 325                 r = rt_cache_get_next(seq, v);
 326         ++*pos;
 327         return r;
 328 }
 329
 330 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 331 {
 332         if (v && v != SEQ_START_TOKEN)
 333                 rcu_read_unlock_bh();
 334 }
 335
 336 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 337 {
 338         if (v == SEQ_START_TOKEN)
 339                 seq_printf(seq, "%-127s\n",
 340                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 341                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 342                            "HHUptod\tSpecDst");
 343         else {
 344                 struct rtable *r = v;
 345                 char temp[256];
 346
 347                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 348                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 349                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 350                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 351                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 352                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 353                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 354                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 355                         dst_metric(&r->u.dst, RTAX_WINDOW),
 356                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 357                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 358                         r->fl.fl4_tos,
 359                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 360                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 361                                        dev_queue_xmit) : 0,
 362                         r->rt_spec_dst);
 363                 seq_printf(seq, "%-127s\n", temp);
 364         }
 365         return 0;
 366 }
 367
 368 static struct seq_operations rt_cache_seq_ops = {
 369         .start  = rt_cache_seq_start,
 370         .next   = rt_cache_seq_next,
 371         .stop   = rt_cache_seq_stop,
 372         .show   = rt_cache_seq_show,
 373 };
 374
 375 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 376 {
 377         struct seq_file *seq;
 378         int rc = -ENOMEM;
 379         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
 380
 381         if (!s)
 382                 goto out;
 383         rc = seq_open(file, &rt_cache_seq_ops);
 384         if (rc)
 385                 goto out_kfree;
 386         seq          = file->private_data;
 387         seq->private = s;
 388         memset(s, 0, sizeof(*s));
 389 out:
 390         return rc;
 391 out_kfree:
 392         kfree(s);
 393         goto out;
 394 }
 395
 396 static struct file_operations rt_cache_seq_fops = {
 397         .owner   = THIS_MODULE,
 398         .open    = rt_cache_seq_open,
 399         .read    = seq_read,
 400         .llseek  = seq_lseek,
 401         .release = seq_release_private,
 402 };
 403
 404
 405 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 406 {
 407         int cpu;
 408
 409         if (*pos == 0)
 410                 return SEQ_START_TOKEN;
 411
 412         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 413                 if (!cpu_possible(cpu))
 414                         continue;
 415                 *pos = cpu+1;
 416                 return &per_cpu(rt_cache_stat, cpu);
 417         }
 418         return NULL;
 419 }
 420
 421 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 422 {
 423         int cpu;
 424
 425         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 426                 if (!cpu_possible(cpu))
 427                         continue;
 428                 *pos = cpu+1;
 429                 return &per_cpu(rt_cache_stat, cpu);
 430         }
 431         return NULL;
 432
 433 }
 434
 435 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 436 {
 437
 438 }
 439
 440 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 441 {
 442         struct rt_cache_stat *st = v;
 443
 444         if (v == SEQ_START_TOKEN) {
 445                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 446                 return 0;
 447         }
 448
 449         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 450                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 451                    atomic_read(&ipv4_dst_ops.entries),
 452                    st->in_hit,
 453                    st->in_slow_tot,
 454                    st->in_slow_mc,
 455                    st->in_no_route,
 456                    st->in_brd,
 457                    st->in_martian_dst,
 458                    st->in_martian_src,
 459
 460                    st->out_hit,
 461                    st->out_slow_tot,
 462                    st->out_slow_mc,
 463
 464                    st->gc_total,
 465                    st->gc_ignored,
 466                    st->gc_goal_miss,
 467                    st->gc_dst_overflow,
 468                    st->in_hlist_search,
 469                    st->out_hlist_search
 470                 );
 471         return 0;
 472 }
 473
 474 static struct seq_operations rt_cpu_seq_ops = {
 475         .start  = rt_cpu_seq_start,
 476         .next   = rt_cpu_seq_next,
 477         .stop   = rt_cpu_seq_stop,
 478         .show   = rt_cpu_seq_show,
 479 };
 480
 481
 482 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 483 {
 484         return seq_open(file, &rt_cpu_seq_ops);
 485 }
 486
 487 static struct file_operations rt_cpu_seq_fops = {
 488         .owner   = THIS_MODULE,
 489         .open    = rt_cpu_seq_open,
 490         .read    = seq_read,
 491         .llseek  = seq_lseek,
 492         .release = seq_release,
 493 };
 494
 495 #endif /* CONFIG_PROC_FS */
 496
 497 static __inline__ void rt_free(struct rtable *rt)
 498 {
 499         multipath_remove(rt);
 500         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 501 }
 502
 503 static __inline__ void rt_drop(struct rtable *rt)
 504 {
 505         multipath_remove(rt);
 506         ip_rt_put(rt);
 507         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 508 }
 509
 510 static __inline__ int rt_fast_clean(struct rtable *rth)
 511 {
 512         /* Kill broadcast/multicast entries very aggresively, if they
 513            collide in hash table with more useful entries */
 514         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 515                 rth->fl.iif && rth->u.rt_next;
 516 }
 517
 518 static __inline__ int rt_valuable(struct rtable *rth)
 519 {
 520         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 521                 rth->u.dst.expires;
 522 }
 523
 524 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 525 {
 526         unsigned long age;
 527         int ret = 0;
 528
 529         if (atomic_read(&rth->u.dst.__refcnt))
 530                 goto out;
 531
 532         ret = 1;
 533         if (rth->u.dst.expires &&
 534             time_after_eq(jiffies, rth->u.dst.expires))
 535                 goto out;
 536
 537         age = jiffies - rth->u.dst.lastuse;
 538         ret = 0;
 539         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 540             (age <= tmo2 && rt_valuable(rth)))
 541                 goto out;
 542         ret = 1;
 543 out:    return ret;
 544 }
 545
 546 /* Bits of score are:
 547  * 31: very valuable
 548  * 30: not quite useless
 549  * 29..0: usage counter
 550  */
 551 static inline u32 rt_score(struct rtable *rt)
 552 {
 553         u32 score = jiffies - rt->u.dst.lastuse;
 554
 555         score = ~score & ~(3<<30);
 556
 557         if (rt_valuable(rt))
 558                 score |= (1<<31);
 559
 560         if (!rt->fl.iif ||
 561             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 562                 score |= (1<<30);
 563
 564         return score;
 565 }
 566
 567 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 568 {
 569         return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
 570                fl1->oif     == fl2->oif &&
 571                fl1->iif     == fl2->iif;
 572 }
 573
 574 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 575 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 576                                                 struct rtable *expentry,
 577                                                 int *removed_count)
 578 {
 579         int passedexpired = 0;
 580         struct rtable **nextstep = NULL;
 581         struct rtable **rthp = chain_head;
 582         struct rtable *rth;
 583
 584         if (removed_count)
 585                 *removed_count = 0;
 586
 587         while ((rth = *rthp) != NULL) {
 588                 if (rth == expentry)
 589                         passedexpired = 1;
 590
 591                 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
 592                     compare_keys(&(*rthp)->fl, &expentry->fl)) {
 593                         if (*rthp == expentry) {
 594                                 *rthp = rth->u.rt_next;
 595                                 continue;
 596                         } else {
 597                                 *rthp = rth->u.rt_next;
 598                                 rt_free(rth);
 599                                 if (removed_count)
 600                                         ++(*removed_count);
 601                         }
 602                 } else {
 603                         if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
 604                             passedexpired && !nextstep)
 605                                 nextstep = &rth->u.rt_next;
 606
 607                         rthp = &rth->u.rt_next;
 608                 }
 609         }
 610
 611         rt_free(expentry);
 612         if (removed_count)
 613                 ++(*removed_count);
 614
 615         return nextstep;
 616 }
 617 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 618
 619
 620 /* This runs via a timer and thus is always in BH context. */
 621 static void rt_check_expire(unsigned long dummy)
 622 {
 623         static unsigned int rover;
 624         unsigned int i = rover, goal;
 625         struct rtable *rth, **rthp;
 626         unsigned long now = jiffies;
 627         u64 mult;
 628
 629         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 630         if (ip_rt_gc_timeout > 1)
 631                 do_div(mult, ip_rt_gc_timeout);
 632         goal = (unsigned int)mult;
 633         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
 634         for (; goal > 0; goal--) {
 635                 unsigned long tmo = ip_rt_gc_timeout;
 636
 637                 i = (i + 1) & rt_hash_mask;
 638                 rthp = &rt_hash_table[i].chain;
 639
 640                 if (*rthp == 0)
 641                         continue;
 642                 spin_lock(rt_hash_lock_addr(i));
 643                 while ((rth = *rthp) != NULL) {
 644                         if (rth->u.dst.expires) {
 645                                 /* Entry is expired even if it is in use */
 646                                 if (time_before_eq(now, rth->u.dst.expires)) {
 647                                         tmo >>= 1;
 648                                         rthp = &rth->u.rt_next;
 649                                         continue;
 650                                 }
 651                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 652                                 tmo >>= 1;
 653                                 rthp = &rth->u.rt_next;
 654                                 continue;
 655                         }
 656
 657                         /* Cleanup aged off entries. */
 658 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 659                         /* remove all related balanced entries if necessary */
 660                         if (rth->u.dst.flags & DST_BALANCED) {
 661                                 rthp = rt_remove_balanced_route(
 662                                         &rt_hash_table[i].chain,
 663                                         rth, NULL);
 664                                 if (!rthp)
 665                                         break;
 666                         } else {
 667                                 *rthp = rth->u.rt_next;
 668                                 rt_free(rth);
 669                         }
 670 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 671                         *rthp = rth->u.rt_next;
 672                         rt_free(rth);
 673 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 674                 }
 675                 spin_unlock(rt_hash_lock_addr(i));
 676
 677                 /* Fallback loop breaker. */
 678                 if (time_after(jiffies, now))
 679                         break;
 680         }
 681         rover = i;
 682         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 683 }
 684
 685 /* This can run from both BH and non-BH contexts, the latter
 686  * in the case of a forced flush event.
 687  */
 688 static void rt_run_flush(unsigned long dummy)
 689 {
 690         int i;
 691         struct rtable *rth, *next;
 692
 693         rt_deadline = 0;
 694
 695         get_random_bytes(&rt_hash_rnd, 4);
 696
 697         for (i = rt_hash_mask; i >= 0; i--) {
 698                 spin_lock_bh(rt_hash_lock_addr(i));
 699                 rth = rt_hash_table[i].chain;
 700                 if (rth)
 701                         rt_hash_table[i].chain = NULL;
 702                 spin_unlock_bh(rt_hash_lock_addr(i));
 703
 704                 for (; rth; rth = next) {
 705                         next = rth->u.rt_next;
 706                         rt_free(rth);
 707                 }
 708         }
 709 }
 710
 711 static DEFINE_SPINLOCK(rt_flush_lock);
 712
 713 void rt_cache_flush(int delay)
 714 {
 715         unsigned long now = jiffies;
 716         int user_mode = !in_softirq();
 717
 718         if (delay < 0)
 719                 delay = ip_rt_min_delay;
 720
 721         /* flush existing multipath state*/
 722         multipath_flush();
 723
 724         spin_lock_bh(&rt_flush_lock);
 725
 726         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 727                 long tmo = (long)(rt_deadline - now);
 728
 729                 /* If flush timer is already running
 730                    and flush request is not immediate (delay > 0):
 731
 732                    if deadline is not achieved, prolongate timer to "delay",
 733                    otherwise fire it at deadline time.
 734                  */
 735
 736                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 737                         tmo = 0;
 738
 739                 if (delay > tmo)
 740                         delay = tmo;
 741         }
 742
 743         if (delay <= 0) {
 744                 spin_unlock_bh(&rt_flush_lock);
 745                 rt_run_flush(0);
 746                 return;
 747         }
 748
 749         if (rt_deadline == 0)
 750                 rt_deadline = now + ip_rt_max_delay;
 751
 752         mod_timer(&rt_flush_timer, now+delay);
 753         spin_unlock_bh(&rt_flush_lock);
 754 }
 755
 756 static void rt_secret_rebuild(unsigned long dummy)
 757 {
 758         unsigned long now = jiffies;
 759
 760         rt_cache_flush(0);
 761         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 762 }
 763
  764 /*
  765    Short description of GC goals.
  766
  767    We want an algorithm that keeps the routing cache at an
  768    equilibrium point, where the number of aged-off entries
  769    stays approximately equal to the number of newly created ones.
  770
  771    The current expiration strength is the variable "expire".
  772    We try to adjust it dynamically, so that when networking is
  773    idle "expire" is large enough to keep enough warm entries,
  774    and when load increases it shrinks to limit the cache size.
  775  */
 776
 777 static int rt_garbage_collect(void)
 778 {
 779         static unsigned long expire = RT_GC_TIMEOUT;
 780         static unsigned long last_gc;
 781         static int rover;
 782         static int equilibrium;
 783         struct rtable *rth, **rthp;
 784         unsigned long now = jiffies;
 785         int goal;
 786
 787         /*
 788          * Garbage collection is pretty expensive,
 789          * do not make it too frequently.
 790          */
 791
 792         RT_CACHE_STAT_INC(gc_total);
 793
 794         if (now - last_gc < ip_rt_gc_min_interval &&
 795             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 796                 RT_CACHE_STAT_INC(gc_ignored);
 797                 goto out;
 798         }
 799
 800         /* Calculate number of entries, which we want to expire now. */
 801         goal = atomic_read(&ipv4_dst_ops.entries) -
 802                 (ip_rt_gc_elasticity << rt_hash_log);
 803         if (goal <= 0) {
 804                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 805                         equilibrium = ipv4_dst_ops.gc_thresh;
 806                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 807                 if (goal > 0) {
 808                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 809                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 810                 }
 811         } else {
 812                 /* We are in dangerous area. Try to reduce cache really
 813                  * aggressively.
 814                  */
 815                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 816                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 817         }
 818
 819         if (now - last_gc >= ip_rt_gc_min_interval)
 820                 last_gc = now;
 821
 822         if (goal <= 0) {
 823                 equilibrium += goal;
 824                 goto work_done;
 825         }
 826
 827         do {
 828                 int i, k;
 829
 830                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 831                         unsigned long tmo = expire;
 832
 833                         k = (k + 1) & rt_hash_mask;
 834                         rthp = &rt_hash_table[k].chain;
 835                         spin_lock_bh(rt_hash_lock_addr(k));
 836                         while ((rth = *rthp) != NULL) {
 837                                 if (!rt_may_expire(rth, tmo, expire)) {
 838                                         tmo >>= 1;
 839                                         rthp = &rth->u.rt_next;
 840                                         continue;
 841                                 }
 842 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 843                                 /* remove all related balanced entries
 844                                  * if necessary
 845                                  */
 846                                 if (rth->u.dst.flags & DST_BALANCED) {
 847                                         int r;
 848
 849                                         rthp = rt_remove_balanced_route(
 850                                                 &rt_hash_table[k].chain,
 851                                                 rth,
 852                                                 &r);
 853                                         goal -= r;
 854                                         if (!rthp)
 855                                                 break;
 856                                 } else {
 857                                         *rthp = rth->u.rt_next;
 858                                         rt_free(rth);
 859                                         goal--;
 860                                 }
 861 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 862                                 *rthp = rth->u.rt_next;
 863                                 rt_free(rth);
 864                                 goal--;
 865 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 866                         }
 867                         spin_unlock_bh(rt_hash_lock_addr(k));
 868                         if (goal <= 0)
 869                                 break;
 870                 }
 871                 rover = k;
 872
 873                 if (goal <= 0)
 874                         goto work_done;
 875
 876                 /* Goal is not achieved. We stop process if:
 877
 878                    - if expire reduced to zero. Otherwise, expire is halfed.
 879                    - if table is not full.
 880                    - if we are called from interrupt.
 881                    - jiffies check is just fallback/debug loop breaker.
 882                      We will not spin here for long time in any case.
 883                  */
 884
 885                 RT_CACHE_STAT_INC(gc_goal_miss);
 886
 887                 if (expire == 0)
 888                         break;
 889
 890                 expire >>= 1;
 891 #if RT_CACHE_DEBUG >= 2
 892                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 893                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 894 #endif
 895
 896                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 897                         goto out;
 898         } while (!in_softirq() && time_before_eq(jiffies, now));
 899
 900         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 901                 goto out;
 902         if (net_ratelimit())
 903                 printk(KERN_WARNING "dst cache overflow\n");
 904         RT_CACHE_STAT_INC(gc_dst_overflow);
 905         return 1;
 906
 907 work_done:
 908         expire += ip_rt_gc_min_interval;
 909         if (expire > ip_rt_gc_timeout ||
 910             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 911                 expire = ip_rt_gc_timeout;
 912 #if RT_CACHE_DEBUG >= 2
 913         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 914                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 915 #endif
 916 out:    return 0;
 917 }
 918
 919 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 920 {
 921         struct rtable   *rth, **rthp;
 922         unsigned long   now;
 923         struct rtable *cand, **candp;
 924         u32             min_score;
 925         int             chain_length;
 926         int attempts = !in_softirq();
 927
 928 restart:
 929         chain_length = 0;
 930         min_score = ~(u32)0;
 931         cand = NULL;
 932         candp = NULL;
 933         now = jiffies;
 934
 935         rthp = &rt_hash_table[hash].chain;
 936
 937         spin_lock_bh(rt_hash_lock_addr(hash));
 938         while ((rth = *rthp) != NULL) {
 939 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 940                 if (!(rth->u.dst.flags & DST_BALANCED) &&
 941                     compare_keys(&rth->fl, &rt->fl)) {
 942 #else
 943                 if (compare_keys(&rth->fl, &rt->fl)) {
 944 #endif
 945                         /* Put it first */
 946                         *rthp = rth->u.rt_next;
 947                         /*
 948                          * Since lookup is lockfree, the deletion
 949                          * must be visible to another weakly ordered CPU before
 950                          * the insertion at the start of the hash chain.
 951                          */
 952                         rcu_assign_pointer(rth->u.rt_next,
 953                                            rt_hash_table[hash].chain);
 954                         /*
 955                          * Since lookup is lockfree, the update writes
 956                          * must be ordered for consistency on SMP.
 957                          */
 958                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 959
 960                         rth->u.dst.__use++;
 961                         dst_hold(&rth->u.dst);
 962                         rth->u.dst.lastuse = now;
 963                         spin_unlock_bh(rt_hash_lock_addr(hash));
 964
 965                         rt_drop(rt);
 966                         *rp = rth;
 967                         return 0;
 968                 }
 969
 970                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 971                         u32 score = rt_score(rth);
 972
 973                         if (score <= min_score) {
 974                                 cand = rth;
 975                                 candp = rthp;
 976                                 min_score = score;
 977                         }
 978                 }
 979
 980                 chain_length++;
 981
 982                 rthp = &rth->u.rt_next;
 983         }
 984
 985         if (cand) {
 986                 /* ip_rt_gc_elasticity used to be average length of chain
 987                  * length, when exceeded gc becomes really aggressive.
 988                  *
 989                  * The second limit is less certain. At the moment it allows
 990                  * only 2 entries per bucket. We will see.
 991                  */
 992                 if (chain_length > ip_rt_gc_elasticity) {
 993                         *candp = cand->u.rt_next;
 994                         rt_free(cand);
 995                 }
 996         }
 997
 998         /* Try to bind route to arp only if it is output
 999            route or unicast forwarding path.
1000          */
1001         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1002                 int err = arp_bind_neighbour(&rt->u.dst);
1003                 if (err) {
1004                         spin_unlock_bh(rt_hash_lock_addr(hash));
1005
1006                         if (err != -ENOBUFS) {
1007                                 rt_drop(rt);
1008                                 return err;
1009                         }
1010
1011                         /* Neighbour tables are full and nothing
1012                            can be released. Try to shrink route cache,
1013                            it is most likely it holds some neighbour records.
1014                          */
1015                         if (attempts-- > 0) {
1016                                 int saved_elasticity = ip_rt_gc_elasticity;
1017                                 int saved_int = ip_rt_gc_min_interval;
1018                                 ip_rt_gc_elasticity     = 1;
1019                                 ip_rt_gc_min_interval   = 0;
1020                                 rt_garbage_collect();
1021                                 ip_rt_gc_min_interval   = saved_int;
1022                                 ip_rt_gc_elasticity     = saved_elasticity;
1023                                 goto restart;
1024                         }
1025
1026                         if (net_ratelimit())
1027                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1028                         rt_drop(rt);
1029                         return -ENOBUFS;
1030                 }
1031         }
1032
1033         rt->u.rt_next = rt_hash_table[hash].chain;
1034 #if RT_CACHE_DEBUG >= 2
1035         if (rt->u.rt_next) {
1036                 struct rtable *trt;
1037                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1038                        NIPQUAD(rt->rt_dst));
1039                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1040                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1041                 printk("\n");
1042         }
1043 #endif
1044         rt_hash_table[hash].chain = rt;
1045         spin_unlock_bh(rt_hash_lock_addr(hash));
1046         *rp = rt;
1047         return 0;
1048 }
1049
1050 void rt_bind_peer(struct rtable *rt, int create)
1051 {
1052         static DEFINE_SPINLOCK(rt_peer_lock);
1053         struct inet_peer *peer;
1054
1055         peer = inet_getpeer(rt->rt_dst, create);
1056
1057         spin_lock_bh(&rt_peer_lock);
1058         if (rt->peer == NULL) {
1059                 rt->peer = peer;
1060                 peer = NULL;
1061         }
1062         spin_unlock_bh(&rt_peer_lock);
1063         if (peer)
1064                 inet_putpeer(peer);
1065 }
1066
1067 /*
1068  * Peer allocation may fail only in serious out-of-memory conditions.  However
1069  * we still can generate some output.
1070  * Random ID selection looks a bit dangerous because we have no chances to
1071  * select ID being unique in a reasonable period of time.
1072  * But broken packet identifier may be better than no packet at all.
1073  */
1074 static void ip_select_fb_ident(struct iphdr *iph)
1075 {
1076         static DEFINE_SPINLOCK(ip_fb_id_lock);
1077         static u32 ip_fallback_id;
1078         u32 salt;
1079
1080         spin_lock_bh(&ip_fb_id_lock);
1081         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1082         iph->id = htons(salt & 0xFFFF);
1083         ip_fallback_id = salt;
1084         spin_unlock_bh(&ip_fb_id_lock);
1085 }
1086
1087 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1088 {
1089         struct rtable *rt = (struct rtable *) dst;
1090
1091         if (rt) {
1092                 if (rt->peer == NULL)
1093                         rt_bind_peer(rt, 1);
1094
1095                 /* If peer is attached to destination, it is never detached,
1096                    so that we need not to grab a lock to dereference it.
1097                  */
1098                 if (rt->peer) {
1099                         iph->id = htons(inet_getid(rt->peer, more));
1100                         return;
1101                 }
1102         } else
1103                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1104                        __builtin_return_address(0));
1105
1106         ip_select_fb_ident(iph);
1107 }
1108
1109 static void rt_del(unsigned hash, struct rtable *rt)
1110 {
1111         struct rtable **rthp;
1112
1113         spin_lock_bh(rt_hash_lock_addr(hash));
1114         ip_rt_put(rt);
1115         for (rthp = &rt_hash_table[hash].chain; *rthp;
1116              rthp = &(*rthp)->u.rt_next)
1117                 if (*rthp == rt) {
1118                         *rthp = rt->u.rt_next;
1119                         rt_free(rt);
1120                         break;
1121                 }
1122         spin_unlock_bh(rt_hash_lock_addr(hash));
1123 }
1124
1125 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1126                     __be32 saddr, struct net_device *dev)
1127 {
1128         int i, k;
1129         struct in_device *in_dev = in_dev_get(dev);
1130         struct rtable *rth, **rthp;
1131         __be32  skeys[2] = { saddr, 0 };
1132         int  ikeys[2] = { dev->ifindex, 0 };
1133         struct netevent_redirect netevent;
1134
1135         if (!in_dev)
1136                 return;
1137
1138         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1139             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1140                 goto reject_redirect;
1141
1142         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1143                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1144                         goto reject_redirect;
1145                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1146                         goto reject_redirect;
1147         } else {
1148                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1149                         goto reject_redirect;
1150         }
1151
1152         for (i = 0; i < 2; i++) {
1153                 for (k = 0; k < 2; k++) {
1154                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1155
1156                         rthp=&rt_hash_table[hash].chain;
1157
1158                         rcu_read_lock();
1159                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1160                                 struct rtable *rt;
1161
1162                                 if (rth->fl.fl4_dst != daddr ||
1163                                     rth->fl.fl4_src != skeys[i] ||
1164                                     rth->fl.oif != ikeys[k] ||
1165                                     rth->fl.iif != 0) {
1166                                         rthp = &rth->u.rt_next;
1167                                         continue;
1168                                 }
1169
1170                                 if (rth->rt_dst != daddr ||
1171                                     rth->rt_src != saddr ||
1172                                     rth->u.dst.error ||
1173                                     rth->rt_gateway != old_gw ||
1174                                     rth->u.dst.dev != dev)
1175                                         break;
1176
1177                                 dst_hold(&rth->u.dst);
1178                                 rcu_read_unlock();
1179
1180                                 rt = dst_alloc(&ipv4_dst_ops);
1181                                 if (rt == NULL) {
1182                                         ip_rt_put(rth);
1183                                         in_dev_put(in_dev);
1184                                         return;
1185                                 }
1186
1187                                 /* Copy all the information. */
1188                                 *rt = *rth;
1189                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1190                                 rt->u.dst.__use         = 1;
1191                                 atomic_set(&rt->u.dst.__refcnt, 1);
1192                                 rt->u.dst.child         = NULL;
1193                                 if (rt->u.dst.dev)
1194                                         dev_hold(rt->u.dst.dev);
1195                                 if (rt->idev)
1196                                         in_dev_hold(rt->idev);
1197                                 rt->u.dst.obsolete      = 0;
1198                                 rt->u.dst.lastuse       = jiffies;
1199                                 rt->u.dst.path          = &rt->u.dst;
1200                                 rt->u.dst.neighbour     = NULL;
1201                                 rt->u.dst.hh            = NULL;
1202                                 rt->u.dst.xfrm          = NULL;
1203
1204                                 rt->rt_flags            |= RTCF_REDIRECTED;
1205
1206                                 /* Gateway is different ... */
1207                                 rt->rt_gateway          = new_gw;
1208
1209                                 /* Redirect received -> path was valid */
1210                                 dst_confirm(&rth->u.dst);
1211
1212                                 if (rt->peer)
1213                                         atomic_inc(&rt->peer->refcnt);
1214
1215                                 if (arp_bind_neighbour(&rt->u.dst) ||
1216                                     !(rt->u.dst.neighbour->nud_state &
1217                                             NUD_VALID)) {
1218                                         if (rt->u.dst.neighbour)
1219                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1220                                         ip_rt_put(rth);
1221                                         rt_drop(rt);
1222                                         goto do_next;
1223                                 }
1224
1225                                 netevent.old = &rth->u.dst;
1226                                 netevent.new = &rt->u.dst;
1227                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1228                                                         &netevent);
1229
1230                                 rt_del(hash, rth);
1231                                 if (!rt_intern_hash(hash, rt, &rt))
1232                                         ip_rt_put(rt);
1233                                 goto do_next;
1234                         }
1235                         rcu_read_unlock();
1236                 do_next:
1237                         ;
1238                 }
1239         }
1240         in_dev_put(in_dev);
1241         return;
1242
1243 reject_redirect:
1244 #ifdef CONFIG_IP_ROUTE_VERBOSE
1245         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1246                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1247                         "%u.%u.%u.%u ignored.\n"
1248                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1249                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1250                        NIPQUAD(saddr), NIPQUAD(daddr));
1251 #endif
1252         in_dev_put(in_dev);
1253 }
1254
1255 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1256 {
1257         struct rtable *rt = (struct rtable*)dst;
1258         struct dst_entry *ret = dst;
1259
1260         if (rt) {
1261                 if (dst->obsolete) {
1262                         ip_rt_put(rt);
1263                         ret = NULL;
1264                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1265                            rt->u.dst.expires) {
1266                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1267                                                 rt->fl.oif);
1268 #if RT_CACHE_DEBUG >= 1
1269                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1270                                           "%u.%u.%u.%u/%02x dropped\n",
1271                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1272 #endif
1273                         rt_del(hash, rt);
1274                         ret = NULL;
1275                 }
1276         }
1277         return ret;
1278 }
1279
1280 /*
1281  * Algorithm:
1282  *      1. The first ip_rt_redirect_number redirects are sent
1283  *         with exponential backoff, then we stop sending them at all,
1284  *         assuming that the host ignores our redirects.
1285  *      2. If we did not see packets requiring redirects
1286  *         during ip_rt_redirect_silence, we assume that the host
1287  *         forgot redirected route and start to send redirects again.
1288  *
1289  * This algorithm is much cheaper and more intelligent than dumb load limiting
1290  * in icmp.c.
1291  *
1292  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1293  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1294  */
1295
1296 void ip_rt_send_redirect(struct sk_buff *skb)
1297 {
1298         struct rtable *rt = (struct rtable*)skb->dst;
1299         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1300
1301         if (!in_dev)
1302                 return;
1303
1304         if (!IN_DEV_TX_REDIRECTS(in_dev))
1305                 goto out;
1306
1307         /* No redirected packets during ip_rt_redirect_silence;
1308          * reset the algorithm.
1309          */
1310         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1311                 rt->u.dst.rate_tokens = 0;
1312
1313         /* Too many ignored redirects; do not send anything
1314          * set u.dst.rate_last to the last seen redirected packet.
1315          */
1316         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1317                 rt->u.dst.rate_last = jiffies;
1318                 goto out;
1319         }
1320
1321         /* Check for load limit; set rate_last to the latest sent
1322          * redirect.
1323          */
1324         if (time_after(jiffies,
1325                        (rt->u.dst.rate_last +
1326                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1327                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1328                 rt->u.dst.rate_last = jiffies;
1329                 ++rt->u.dst.rate_tokens;
1330 #ifdef CONFIG_IP_ROUTE_VERBOSE
1331                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1332                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1333                     net_ratelimit())
1334                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1335                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1336                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1337                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1338 #endif
1339         }
1340 out:
1341         in_dev_put(in_dev);
1342 }
1343
1344 static int ip_error(struct sk_buff *skb)
1345 {
1346         struct rtable *rt = (struct rtable*)skb->dst;
1347         unsigned long now;
1348         int code;
1349
1350         switch (rt->u.dst.error) {
1351                 case EINVAL:
1352                 default:
1353                         goto out;
1354                 case EHOSTUNREACH:
1355                         code = ICMP_HOST_UNREACH;
1356                         break;
1357                 case ENETUNREACH:
1358                         code = ICMP_NET_UNREACH;
1359                         break;
1360                 case EACCES:
1361                         code = ICMP_PKT_FILTERED;
1362                         break;
1363         }
1364
1365         now = jiffies;
1366         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1367         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1368                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1369         rt->u.dst.rate_last = now;
1370         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1371                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1372                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1373         }
1374
1375 out:    kfree_skb(skb);
1376         return 0;
1377 }
1378
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest table entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; plateau != end; plateau++)
		if (*plateau < old_mtu)
			return *plateau;
	return 68;
}
1396
1397 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1398 {
1399         int i;
1400         unsigned short old_mtu = ntohs(iph->tot_len);
1401         struct rtable *rth;
1402         u32  skeys[2] = { iph->saddr, 0, };
1403         u32  daddr = iph->daddr;
1404         unsigned short est_mtu = 0;
1405
1406         if (ipv4_config.no_pmtu_disc)
1407                 return 0;
1408
1409         for (i = 0; i < 2; i++) {
1410                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1411
1412                 rcu_read_lock();
1413                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1414                      rth = rcu_dereference(rth->u.rt_next)) {
1415                         if (rth->fl.fl4_dst == daddr &&
1416                             rth->fl.fl4_src == skeys[i] &&
1417                             rth->rt_dst  == daddr &&
1418                             rth->rt_src  == iph->saddr &&
1419                             rth->fl.iif == 0 &&
1420                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1421                                 unsigned short mtu = new_mtu;
1422
1423                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1424
1425                                         /* BSD 4.2 compatibility hack :-( */
1426                                         if (mtu == 0 &&
1427                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1428                                             old_mtu >= 68 + (iph->ihl << 2))
1429                                                 old_mtu -= iph->ihl << 2;
1430
1431                                         mtu = guess_mtu(old_mtu);
1432                                 }
1433                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1434                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1435                                                 dst_confirm(&rth->u.dst);
1436                                                 if (mtu < ip_rt_min_pmtu) {
1437                                                         mtu = ip_rt_min_pmtu;
1438                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1439                                                                 (1 << RTAX_MTU);
1440                                                 }
1441                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1442                                                 dst_set_expires(&rth->u.dst,
1443                                                         ip_rt_mtu_expires);
1444                                         }
1445                                         est_mtu = mtu;
1446                                 }
1447                         }
1448                 }
1449                 rcu_read_unlock();
1450         }
1451         return est_mtu ? : new_mtu;
1452 }
1453
1454 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1455 {
1456         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1457             !(dst_metric_locked(dst, RTAX_MTU))) {
1458                 if (mtu < ip_rt_min_pmtu) {
1459                         mtu = ip_rt_min_pmtu;
1460                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1461                 }
1462                 dst->metrics[RTAX_MTU-1] = mtu;
1463                 dst_set_expires(dst, ip_rt_mtu_expires);
1464                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1465         }
1466 }
1467
1468 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1469 {
1470         return NULL;
1471 }
1472
1473 static void ipv4_dst_destroy(struct dst_entry *dst)
1474 {
1475         struct rtable *rt = (struct rtable *) dst;
1476         struct inet_peer *peer = rt->peer;
1477         struct in_device *idev = rt->idev;
1478
1479         if (peer) {
1480                 rt->peer = NULL;
1481                 inet_putpeer(peer);
1482         }
1483
1484         if (idev) {
1485                 rt->idev = NULL;
1486                 in_dev_put(idev);
1487         }
1488 }
1489
1490 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1491                             int how)
1492 {
1493         struct rtable *rt = (struct rtable *) dst;
1494         struct in_device *idev = rt->idev;
1495         if (dev != &loopback_dev && idev && idev->dev == dev) {
1496                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1497                 if (loopback_idev) {
1498                         rt->idev = loopback_idev;
1499                         in_dev_put(idev);
1500                 }
1501         }
1502 }
1503
1504 static void ipv4_link_failure(struct sk_buff *skb)
1505 {
1506         struct rtable *rt;
1507
1508         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1509
1510         rt = (struct rtable *) skb->dst;
1511         if (rt)
1512                 dst_set_expires(&rt->u.dst, 0);
1513 }
1514
1515 static int ip_rt_bug(struct sk_buff *skb)
1516 {
1517         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1518                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1519                 skb->dev ? skb->dev->name : "?");
1520         kfree_skb(skb);
1521         return 0;
1522 }
1523
1524 /*
1525    We do not cache source address of outgoing interface,
1526    because it is used only by IP RR, TS and SRR options,
1527    so that it out of fast path.
1528
1529    BTW remember: "addr" is allowed to be not aligned
1530    in IP options!
1531  */
1532
1533 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1534 {
1535         __be32 src;
1536         struct fib_result res;
1537
1538         if (rt->fl.iif == 0)
1539                 src = rt->rt_src;
1540         else if (fib_lookup(&rt->fl, &res) == 0) {
1541                 src = FIB_RES_PREFSRC(res);
1542                 fib_res_put(&res);
1543         } else
1544                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1545                                         RT_SCOPE_UNIVERSE);
1546         memcpy(addr, &src, 4);
1547 }
1548
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled independently: a half is taken from @tag only when
 * that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1558
1559 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1560 {
1561         struct fib_info *fi = res->fi;
1562
1563         if (fi) {
1564                 if (FIB_RES_GW(*res) &&
1565                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1566                         rt->rt_gateway = FIB_RES_GW(*res);
1567                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1568                        sizeof(rt->u.dst.metrics));
1569                 if (fi->fib_mtu == 0) {
1570                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1571                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1572                             rt->rt_gateway != rt->rt_dst &&
1573                             rt->u.dst.dev->mtu > 576)
1574                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1575                 }
1576 #ifdef CONFIG_NET_CLS_ROUTE
1577                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1578 #endif
1579         } else
1580                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1581
1582         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1583                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1584         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1585                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1586         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1587                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1588                                        ip_rt_min_advmss);
1589         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1590                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1591
1592 #ifdef CONFIG_NET_CLS_ROUTE
1593 #ifdef CONFIG_IP_MULTIPLE_TABLES
1594         set_class_tag(rt, fib_rules_tclass(res));
1595 #endif
1596         set_class_tag(rt, itag);
1597 #endif
1598         rt->rt_type = res->type;
1599 }
1600
1601 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1602                                 u8 tos, struct net_device *dev, int our)
1603 {
1604         unsigned hash;
1605         struct rtable *rth;
1606         __be32 spec_dst;
1607         struct in_device *in_dev = in_dev_get(dev);
1608         u32 itag = 0;
1609
1610         /* Primary sanity checks. */
1611
1612         if (in_dev == NULL)
1613                 return -EINVAL;
1614
1615         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1616             skb->protocol != htons(ETH_P_IP))
1617                 goto e_inval;
1618
1619         if (ZERONET(saddr)) {
1620                 if (!LOCAL_MCAST(daddr))
1621                         goto e_inval;
1622                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1623         } else if (fib_validate_source(saddr, 0, tos, 0,
1624                                         dev, &spec_dst, &itag) < 0)
1625                 goto e_inval;
1626
1627         rth = dst_alloc(&ipv4_dst_ops);
1628         if (!rth)
1629                 goto e_nobufs;
1630
1631         rth->u.dst.output= ip_rt_bug;
1632
1633         atomic_set(&rth->u.dst.__refcnt, 1);
1634         rth->u.dst.flags= DST_HOST;
1635         if (in_dev->cnf.no_policy)
1636                 rth->u.dst.flags |= DST_NOPOLICY;
1637         rth->fl.fl4_dst = daddr;
1638         rth->rt_dst     = daddr;
1639         rth->fl.fl4_tos = tos;
1640 #ifdef CONFIG_IP_ROUTE_FWMARK
1641         rth->fl.fl4_fwmark= skb->nfmark;
1642 #endif
1643         rth->fl.fl4_src = saddr;
1644         rth->rt_src     = saddr;
1645 #ifdef CONFIG_NET_CLS_ROUTE
1646         rth->u.dst.tclassid = itag;
1647 #endif
1648         rth->rt_iif     =
1649         rth->fl.iif     = dev->ifindex;
1650         rth->u.dst.dev  = &loopback_dev;
1651         dev_hold(rth->u.dst.dev);
1652         rth->idev       = in_dev_get(rth->u.dst.dev);
1653         rth->fl.oif     = 0;
1654         rth->rt_gateway = daddr;
1655         rth->rt_spec_dst= spec_dst;
1656         rth->rt_type    = RTN_MULTICAST;
1657         rth->rt_flags   = RTCF_MULTICAST;
1658         if (our) {
1659                 rth->u.dst.input= ip_local_deliver;
1660                 rth->rt_flags |= RTCF_LOCAL;
1661         }
1662
1663 #ifdef CONFIG_IP_MROUTE
1664         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1665                 rth->u.dst.input = ip_mr_input;
1666 #endif
1667         RT_CACHE_STAT_INC(in_slow_mc);
1668
1669         in_dev_put(in_dev);
1670         hash = rt_hash(daddr, saddr, dev->ifindex);
1671         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1672
1673 e_nobufs:
1674         in_dev_put(in_dev);
1675         return -ENOBUFS;
1676
1677 e_inval:
1678         in_dev_put(in_dev);
1679         return -EINVAL;
1680 }
1681
1682
1683 static void ip_handle_martian_source(struct net_device *dev,
1684                                      struct in_device *in_dev,
1685                                      struct sk_buff *skb,
1686                                      __be32 daddr,
1687                                      __be32 saddr)
1688 {
1689         RT_CACHE_STAT_INC(in_martian_src);
1690 #ifdef CONFIG_IP_ROUTE_VERBOSE
1691         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1692                 /*
1693                  *      RFC1812 recommendation, if source is martian,
1694                  *      the only hint is MAC header.
1695                  */
1696                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1697                         "%u.%u.%u.%u, on dev %s\n",
1698                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1699                 if (dev->hard_header_len && skb->mac.raw) {
1700                         int i;
1701                         unsigned char *p = skb->mac.raw;
1702                         printk(KERN_WARNING "ll header: ");
1703                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1704                                 printk("%02x", *p);
1705                                 if (i < (dev->hard_header_len - 1))
1706                                         printk(":");
1707                         }
1708                         printk("\n");
1709                 }
1710         }
1711 #endif
1712 }
1713
1714 static inline int __mkroute_input(struct sk_buff *skb,
1715                                   struct fib_result* res,
1716                                   struct in_device *in_dev,
1717                                   __be32 daddr, __be32 saddr, u32 tos,
1718                                   struct rtable **result)
1719 {
1720
1721         struct rtable *rth;
1722         int err;
1723         struct in_device *out_dev;
1724         unsigned flags = 0;
1725         u32 spec_dst, itag;
1726
1727         /* get a working reference to the output device */
1728         out_dev = in_dev_get(FIB_RES_DEV(*res));
1729         if (out_dev == NULL) {
1730                 if (net_ratelimit())
1731                         printk(KERN_CRIT "Bug in ip_route_input" \
1732                                "_slow(). Please, report\n");
1733                 return -EINVAL;
1734         }
1735
1736
1737         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1738                                   in_dev->dev, &spec_dst, &itag);
1739         if (err < 0) {
1740                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1741                                          saddr);
1742
1743                 err = -EINVAL;
1744                 goto cleanup;
1745         }
1746
1747         if (err)
1748                 flags |= RTCF_DIRECTSRC;
1749
1750         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1751             (IN_DEV_SHARED_MEDIA(out_dev) ||
1752              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1753                 flags |= RTCF_DOREDIRECT;
1754
1755         if (skb->protocol != htons(ETH_P_IP)) {
1756                 /* Not IP (i.e. ARP). Do not create route, if it is
1757                  * invalid for proxy arp. DNAT routes are always valid.
1758                  */
1759                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1760                         err = -EINVAL;
1761                         goto cleanup;
1762                 }
1763         }
1764
1765
1766         rth = dst_alloc(&ipv4_dst_ops);
1767         if (!rth) {
1768                 err = -ENOBUFS;
1769                 goto cleanup;
1770         }
1771
1772         atomic_set(&rth->u.dst.__refcnt, 1);
1773         rth->u.dst.flags= DST_HOST;
1774 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1775         if (res->fi->fib_nhs > 1)
1776                 rth->u.dst.flags |= DST_BALANCED;
1777 #endif
1778         if (in_dev->cnf.no_policy)
1779                 rth->u.dst.flags |= DST_NOPOLICY;
1780         if (in_dev->cnf.no_xfrm)
1781                 rth->u.dst.flags |= DST_NOXFRM;
1782         rth->fl.fl4_dst = daddr;
1783         rth->rt_dst     = daddr;
1784         rth->fl.fl4_tos = tos;
1785 #ifdef CONFIG_IP_ROUTE_FWMARK
1786         rth->fl.fl4_fwmark= skb->nfmark;
1787 #endif
1788         rth->fl.fl4_src = saddr;
1789         rth->rt_src     = saddr;
1790         rth->rt_gateway = daddr;
1791         rth->rt_iif     =
1792                 rth->fl.iif     = in_dev->dev->ifindex;
1793         rth->u.dst.dev  = (out_dev)->dev;
1794         dev_hold(rth->u.dst.dev);
1795         rth->idev       = in_dev_get(rth->u.dst.dev);
1796         rth->fl.oif     = 0;
1797         rth->rt_spec_dst= spec_dst;
1798
1799         rth->u.dst.input = ip_forward;
1800         rth->u.dst.output = ip_output;
1801
1802         rt_set_nexthop(rth, res, itag);
1803
1804         rth->rt_flags = flags;
1805
1806         *result = rth;
1807         err = 0;
1808  cleanup:
1809         /* release the working reference to the output device */
1810         in_dev_put(out_dev);
1811         return err;
1812 }
1813
1814 static inline int ip_mkroute_input_def(struct sk_buff *skb,
1815                                        struct fib_result* res,
1816                                        const struct flowi *fl,
1817                                        struct in_device *in_dev,
1818                                        __be32 daddr, __be32 saddr, u32 tos)
1819 {
1820         struct rtable* rth = NULL;
1821         int err;
1822         unsigned hash;
1823
1824 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1825         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1826                 fib_select_multipath(fl, res);
1827 #endif
1828
1829         /* create a routing cache entry */
1830         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1831         if (err)
1832                 return err;
1833
1834         /* put it into the cache */
1835         hash = rt_hash(daddr, saddr, fl->iif);
1836         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1837 }
1838
1839 static inline int ip_mkroute_input(struct sk_buff *skb,
1840                                    struct fib_result* res,
1841                                    const struct flowi *fl,
1842                                    struct in_device *in_dev,
1843                                    __be32 daddr, __be32 saddr, u32 tos)
1844 {
1845 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1846         struct rtable* rth = NULL, *rtres;
1847         unsigned char hop, hopcount;
1848         int err = -EINVAL;
1849         unsigned int hash;
1850
1851         if (res->fi)
1852                 hopcount = res->fi->fib_nhs;
1853         else
1854                 hopcount = 1;
1855
1856         /* distinguish between multipath and singlepath */
1857         if (hopcount < 2)
1858                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1859                                             saddr, tos);
1860
1861         /* add all alternatives to the routing cache */
1862         for (hop = 0; hop < hopcount; hop++) {
1863                 res->nh_sel = hop;
1864
1865                 /* put reference to previous result */
1866                 if (hop)
1867                         ip_rt_put(rtres);
1868
1869                 /* create a routing cache entry */
1870                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1871                                       &rth);
1872                 if (err)
1873                         return err;
1874
1875                 /* put it into the cache */
1876                 hash = rt_hash(daddr, saddr, fl->iif);
1877                 err = rt_intern_hash(hash, rth, &rtres);
1878                 if (err)
1879                         return err;
1880
1881                 /* forward hop information to multipath impl. */
1882                 multipath_set_nhinfo(rth,
1883                                      FIB_RES_NETWORK(*res),
1884                                      FIB_RES_NETMASK(*res),
1885                                      res->prefixlen,
1886                                      &FIB_RES_NH(*res));
1887         }
1888         skb->dst = &rtres->u.dst;
1889         return err;
1890 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1891         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1892 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1893 }
1894
1895
1896 /*
1897  *      NOTE. We drop all the packets that has local source
1898  *      addresses, because every properly looped back packet
1899  *      must have correct destination already attached by output routine.
1900  *
1901  *      Such approach solves two big problems:
1902  *      1. Not simplex devices are handled properly.
1903  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1904  */
1905
1906 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1907                                u8 tos, struct net_device *dev)
1908 {
1909         struct fib_result res;
1910         struct in_device *in_dev = in_dev_get(dev);
1911         struct flowi fl = { .nl_u = { .ip4_u =
1912                                       { .daddr = daddr,
1913                                         .saddr = saddr,
1914                                         .tos = tos,
1915                                         .scope = RT_SCOPE_UNIVERSE,
1916 #ifdef CONFIG_IP_ROUTE_FWMARK
1917                                         .fwmark = skb->nfmark
1918 #endif
1919                                       } },
1920                             .iif = dev->ifindex };
1921         unsigned        flags = 0;
1922         u32             itag = 0;
1923         struct rtable * rth;
1924         unsigned        hash;
1925         __be32          spec_dst;
1926         int             err = -EINVAL;
1927         int             free_res = 0;
1928
1929         /* IP on this device is disabled. */
1930
1931         if (!in_dev)
1932                 goto out;
1933
1934         /* Check for the most weird martians, which can be not detected
1935            by fib_lookup.
1936          */
1937
1938         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1939                 goto martian_source;
1940
1941         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1942                 goto brd_input;
1943
1944         /* Accept zero addresses only to limited broadcast;
1945          * I even do not know to fix it or not. Waiting for complains :-)
1946          */
1947         if (ZERONET(saddr))
1948                 goto martian_source;
1949
1950         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1951                 goto martian_destination;
1952
1953         /*
1954          *      Now we are ready to route packet.
1955          */
1956         if ((err = fib_lookup(&fl, &res)) != 0) {
1957                 if (!IN_DEV_FORWARD(in_dev))
1958                         goto e_hostunreach;
1959                 goto no_route;
1960         }
1961         free_res = 1;
1962
1963         RT_CACHE_STAT_INC(in_slow_tot);
1964
1965         if (res.type == RTN_BROADCAST)
1966                 goto brd_input;
1967
1968         if (res.type == RTN_LOCAL) {
1969                 int result;
1970                 result = fib_validate_source(saddr, daddr, tos,
1971                                              loopback_dev.ifindex,
1972                                              dev, &spec_dst, &itag);
1973                 if (result < 0)
1974                         goto martian_source;
1975                 if (result)
1976                         flags |= RTCF_DIRECTSRC;
1977                 spec_dst = daddr;
1978                 goto local_input;
1979         }
1980
1981         if (!IN_DEV_FORWARD(in_dev))
1982                 goto e_hostunreach;
1983         if (res.type != RTN_UNICAST)
1984                 goto martian_destination;
1985
1986         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1987         if (err == -ENOBUFS)
1988                 goto e_nobufs;
1989         if (err == -EINVAL)
1990                 goto e_inval;
1991
1992 done:
1993         in_dev_put(in_dev);
1994         if (free_res)
1995                 fib_res_put(&res);
1996 out:    return err;
1997
1998 brd_input:
1999         if (skb->protocol != htons(ETH_P_IP))
2000                 goto e_inval;
2001
2002         if (ZERONET(saddr))
2003                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2004         else {
2005                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2006                                           &itag);
2007                 if (err < 0)
2008                         goto martian_source;
2009                 if (err)
2010                         flags |= RTCF_DIRECTSRC;
2011         }
2012         flags |= RTCF_BROADCAST;
2013         res.type = RTN_BROADCAST;
2014         RT_CACHE_STAT_INC(in_brd);
2015
2016 local_input:
2017         rth = dst_alloc(&ipv4_dst_ops);
2018         if (!rth)
2019                 goto e_nobufs;
2020
2021         rth->u.dst.output= ip_rt_bug;
2022
2023         atomic_set(&rth->u.dst.__refcnt, 1);
2024         rth->u.dst.flags= DST_HOST;
2025         if (in_dev->cnf.no_policy)
2026                 rth->u.dst.flags |= DST_NOPOLICY;
2027         rth->fl.fl4_dst = daddr;
2028         rth->rt_dst     = daddr;
2029         rth->fl.fl4_tos = tos;
2030 #ifdef CONFIG_IP_ROUTE_FWMARK
2031         rth->fl.fl4_fwmark= skb->nfmark;
2032 #endif
2033         rth->fl.fl4_src = saddr;
2034         rth->rt_src     = saddr;
2035 #ifdef CONFIG_NET_CLS_ROUTE
2036         rth->u.dst.tclassid = itag;
2037 #endif
2038         rth->rt_iif     =
2039         rth->fl.iif     = dev->ifindex;
2040         rth->u.dst.dev  = &loopback_dev;
2041         dev_hold(rth->u.dst.dev);
2042         rth->idev       = in_dev_get(rth->u.dst.dev);
2043         rth->rt_gateway = daddr;
2044         rth->rt_spec_dst= spec_dst;
2045         rth->u.dst.input= ip_local_deliver;
2046         rth->rt_flags   = flags|RTCF_LOCAL;
2047         if (res.type == RTN_UNREACHABLE) {
2048                 rth->u.dst.input= ip_error;
2049                 rth->u.dst.error= -err;
2050                 rth->rt_flags   &= ~RTCF_LOCAL;
2051         }
2052         rth->rt_type    = res.type;
2053         hash = rt_hash(daddr, saddr, fl.iif);
2054         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2055         goto done;
2056
2057 no_route:
2058         RT_CACHE_STAT_INC(in_no_route);
2059         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2060         res.type = RTN_UNREACHABLE;
2061         goto local_input;
2062
2063         /*
2064          *      Do not cache martian addresses: they should be logged (RFC1812)
2065          */
2066 martian_destination:
2067         RT_CACHE_STAT_INC(in_martian_dst);
2068 #ifdef CONFIG_IP_ROUTE_VERBOSE
2069         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2070                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2071                         "%u.%u.%u.%u, dev %s\n",
2072                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2073 #endif
2074
2075 e_hostunreach:
2076         err = -EHOSTUNREACH;
2077         goto done;
2078
2079 e_inval:
2080         err = -EINVAL;
2081         goto done;
2082
2083 e_nobufs:
2084         err = -ENOBUFS;
2085         goto done;
2086
2087 martian_source:
2088         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2089         goto e_inval;
2090 }
2091
2092 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2093                    u8 tos, struct net_device *dev)
2094 {
2095         struct rtable * rth;
2096         unsigned        hash;
2097         int iif = dev->ifindex;
2098
2099         tos &= IPTOS_RT_MASK;
2100         hash = rt_hash(daddr, saddr, iif);
2101
2102         rcu_read_lock();
2103         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2104              rth = rcu_dereference(rth->u.rt_next)) {
2105                 if (rth->fl.fl4_dst == daddr &&
2106                     rth->fl.fl4_src == saddr &&
2107                     rth->fl.iif == iif &&
2108                     rth->fl.oif == 0 &&
2109 #ifdef CONFIG_IP_ROUTE_FWMARK
2110                     rth->fl.fl4_fwmark == skb->nfmark &&
2111 #endif
2112                     rth->fl.fl4_tos == tos) {
2113                         rth->u.dst.lastuse = jiffies;
2114                         dst_hold(&rth->u.dst);
2115                         rth->u.dst.__use++;
2116                         RT_CACHE_STAT_INC(in_hit);
2117                         rcu_read_unlock();
2118                         skb->dst = (struct dst_entry*)rth;
2119                         return 0;
2120                 }
2121                 RT_CACHE_STAT_INC(in_hlist_search);
2122         }
2123         rcu_read_unlock();
2124
2125         /* Multicast recognition logic is moved from route cache to here.
2126            The problem was that too many Ethernet cards have broken/missing
2127            hardware multicast filters :-( As result the host on multicasting
2128            network acquires a lot of useless route cache entries, sort of
2129            SDR messages from all the world. Now we try to get rid of them.
2130            Really, provided software IP multicast filter is organized
2131            reasonably (at least, hashed), it does not result in a slowdown
2132            comparing with route cache reject entries.
2133            Note, that multicast routers are not affected, because
2134            route cache entry is created eventually.
2135          */
2136         if (MULTICAST(daddr)) {
2137                 struct in_device *in_dev;
2138
2139                 rcu_read_lock();
2140                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2141                         int our = ip_check_mc(in_dev, daddr, saddr,
2142                                 skb->nh.iph->protocol);
2143                         if (our
2144 #ifdef CONFIG_IP_MROUTE
2145                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2146 #endif
2147                             ) {
2148                                 rcu_read_unlock();
2149                                 return ip_route_input_mc(skb, daddr, saddr,
2150                                                          tos, dev, our);
2151                         }
2152                 }
2153                 rcu_read_unlock();
2154                 return -EINVAL;
2155         }
2156         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2157 }
2158
2159 static inline int __mkroute_output(struct rtable **result,
2160                                    struct fib_result* res,
2161                                    const struct flowi *fl,
2162                                    const struct flowi *oldflp,
2163                                    struct net_device *dev_out,
2164                                    unsigned flags)
2165 {
2166         struct rtable *rth;
2167         struct in_device *in_dev;
2168         u32 tos = RT_FL_TOS(oldflp);
2169         int err = 0;
2170
2171         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2172                 return -EINVAL;
2173
2174         if (fl->fl4_dst == 0xFFFFFFFF)
2175                 res->type = RTN_BROADCAST;
2176         else if (MULTICAST(fl->fl4_dst))
2177                 res->type = RTN_MULTICAST;
2178         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2179                 return -EINVAL;
2180
2181         if (dev_out->flags & IFF_LOOPBACK)
2182                 flags |= RTCF_LOCAL;
2183
2184         /* get work reference to inet device */
2185         in_dev = in_dev_get(dev_out);
2186         if (!in_dev)
2187                 return -EINVAL;
2188
2189         if (res->type == RTN_BROADCAST) {
2190                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2191                 if (res->fi) {
2192                         fib_info_put(res->fi);
2193                         res->fi = NULL;
2194                 }
2195         } else if (res->type == RTN_MULTICAST) {
2196                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2197                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2198                                  oldflp->proto))
2199                         flags &= ~RTCF_LOCAL;
2200                 /* If multicast route do not exist use
2201                    default one, but do not gateway in this case.
2202                    Yes, it is hack.
2203                  */
2204                 if (res->fi && res->prefixlen < 4) {
2205                         fib_info_put(res->fi);
2206                         res->fi = NULL;
2207                 }
2208         }
2209
2210
2211         rth = dst_alloc(&ipv4_dst_ops);
2212         if (!rth) {
2213                 err = -ENOBUFS;
2214                 goto cleanup;
2215         }
2216
2217         atomic_set(&rth->u.dst.__refcnt, 1);
2218         rth->u.dst.flags= DST_HOST;
2219 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2220         if (res->fi) {
2221                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2222                 if (res->fi->fib_nhs > 1)
2223                         rth->u.dst.flags |= DST_BALANCED;
2224         }
2225 #endif
2226         if (in_dev->cnf.no_xfrm)
2227                 rth->u.dst.flags |= DST_NOXFRM;
2228         if (in_dev->cnf.no_policy)
2229                 rth->u.dst.flags |= DST_NOPOLICY;
2230
2231         rth->fl.fl4_dst = oldflp->fl4_dst;
2232         rth->fl.fl4_tos = tos;
2233         rth->fl.fl4_src = oldflp->fl4_src;
2234         rth->fl.oif     = oldflp->oif;
2235 #ifdef CONFIG_IP_ROUTE_FWMARK
2236         rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2237 #endif
2238         rth->rt_dst     = fl->fl4_dst;
2239         rth->rt_src     = fl->fl4_src;
2240         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2241         /* get references to the devices that are to be hold by the routing
2242            cache entry */
2243         rth->u.dst.dev  = dev_out;
2244         dev_hold(dev_out);
2245         rth->idev       = in_dev_get(dev_out);
2246         rth->rt_gateway = fl->fl4_dst;
2247         rth->rt_spec_dst= fl->fl4_src;
2248
2249         rth->u.dst.output=ip_output;
2250
2251         RT_CACHE_STAT_INC(out_slow_tot);
2252
2253         if (flags & RTCF_LOCAL) {
2254                 rth->u.dst.input = ip_local_deliver;
2255                 rth->rt_spec_dst = fl->fl4_dst;
2256         }
2257         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2258                 rth->rt_spec_dst = fl->fl4_src;
2259                 if (flags & RTCF_LOCAL &&
2260                     !(dev_out->flags & IFF_LOOPBACK)) {
2261                         rth->u.dst.output = ip_mc_output;
2262                         RT_CACHE_STAT_INC(out_slow_mc);
2263                 }
2264 #ifdef CONFIG_IP_MROUTE
2265                 if (res->type == RTN_MULTICAST) {
2266                         if (IN_DEV_MFORWARD(in_dev) &&
2267                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2268                                 rth->u.dst.input = ip_mr_input;
2269                                 rth->u.dst.output = ip_mc_output;
2270                         }
2271                 }
2272 #endif
2273         }
2274
2275         rt_set_nexthop(rth, res, 0);
2276
2277         rth->rt_flags = flags;
2278
2279         *result = rth;
2280  cleanup:
2281         /* release work reference to inet device */
2282         in_dev_put(in_dev);
2283
2284         return err;
2285 }
2286
2287 static inline int ip_mkroute_output_def(struct rtable **rp,
2288                                         struct fib_result* res,
2289                                         const struct flowi *fl,
2290                                         const struct flowi *oldflp,
2291                                         struct net_device *dev_out,
2292                                         unsigned flags)
2293 {
2294         struct rtable *rth = NULL;
2295         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2296         unsigned hash;
2297         if (err == 0) {
2298                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2299                 err = rt_intern_hash(hash, rth, rp);
2300         }
2301
2302         return err;
2303 }
2304
/*
 * Create the output route cache entry (or entries) for a flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the FIB result does
 * not have more than one next hop, this is ip_mkroute_output_def().
 * Otherwise one cache entry per next hop is built and hashed, and each
 * successfully hashed hop is reported to the multipath implementation.
 * @rp ends up holding the entry returned for the last hop; the
 * reference on each earlier result is dropped.
 *
 * Returns 0 on success or the first error from __mkroute_output() /
 * rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
					oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * The entry was not interned, so it must not be
			 * handed to the multipath layer (rt_intern_hash()
			 * presumably disposes of it on failure - verify).
			 * This mirrors the handling in ip_mkroute_input().
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2366
2367 /*
2368  * Major route resolver routine.
      *
      * Slow-path resolution of an output route for the flow key @oldflp; the
      * resulting cache entry is produced by ip_mkroute_output() through @rp.
      *
      * A private flow key "fl" is built from @oldflp (TOS masked with
      * IPTOS_RT_MASK, link scope when RTO_ONLINK is set, iif set to the
      * loopback ifindex) and then refined in these steps:
      *   1. a caller-supplied source address is sanity-checked and must be
      *      found by ip_dev_find(), otherwise -EINVAL;
      *   2. a caller-supplied oif must exist and have an in_device, otherwise
      *      -ENODEV; a missing source address may be picked from that device;
      *   3. an unspecified destination is routed to this host over loopback;
      *   4. otherwise fib_lookup() decides; on failure the destination is
      *      assumed on-link when oif was given, else -ENETUNREACH is returned;
      *   5. RTN_LOCAL results are redirected to loopback, other results take
      *      their device and preferred source from the FIB result.
      *
      * Returns the result of ip_mkroute_output(), or one of the negative errno
      * values above when resolution fails before make_route is reached.
2369  */
2370
2371 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2372 {
2373         u32 tos = RT_FL_TOS(oldflp);
2374         struct flowi fl = { .nl_u = { .ip4_u =
2375                                       { .daddr = oldflp->fl4_dst,
2376                                         .saddr = oldflp->fl4_src,
2377                                         .tos = tos & IPTOS_RT_MASK,
2378                                         .scope = ((tos & RTO_ONLINK) ?
2379                                                   RT_SCOPE_LINK :
2380                                                   RT_SCOPE_UNIVERSE),
2381 #ifdef CONFIG_IP_ROUTE_FWMARK
2382                                         .fwmark = oldflp->fl4_fwmark
2383 #endif
2384                                       } },
2385                             .iif = loopback_dev.ifindex,
2386                             .oif = oldflp->oif };
2387         struct fib_result res;
2388         unsigned flags = 0;
2389         struct net_device *dev_out = NULL;
2390         int free_res = 0;
2391         int err;
2392
2393
2394         res.fi          = NULL;
2395 #ifdef CONFIG_IP_MULTIPLE_TABLES
2396         res.r           = NULL;
2397 #endif
2398
             /* Caller supplied a source address: validate it first. */
2399         if (oldflp->fl4_src) {
2400                 err = -EINVAL;
2401                 if (MULTICAST(oldflp->fl4_src) ||
2402                     BADCLASS(oldflp->fl4_src) ||
2403                     ZERONET(oldflp->fl4_src))
2404                         goto out;
2405
2406                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2407                 dev_out = ip_dev_find(oldflp->fl4_src);
2408                 if (dev_out == NULL)
2409                         goto out;
2410
2411                 /* I removed check for oif == dev_out->oif here.
2412                    It was wrong for two reasons:
2413                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2414                       assigned to multiple interfaces.
2415                    2. Moreover, we are allowed to send packets with saddr
2416                       of another iface. --ANK
2417                  */
2418
2419                 if (oldflp->oif == 0
2420                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2421                         /* Special hack: user can direct multicasts
2422                            and limited broadcast via necessary interface
2423                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2424                            This hack is not just for fun, it allows
2425                            vic,vat and friends to work.
2426                            They bind socket to loopback, set ttl to zero
2427                            and expect that it will work.
2428                            From the viewpoint of routing cache they are broken,
2429                            because we are not allowed to build multicast path
2430                            with loopback source addr (look, routing cache
2431                            cannot know, that ttl is zero, so that packet
2432                            will not leave this host and route is valid).
2433                            Luckily, this hack is good workaround.
2434                          */
2435
2436                         fl.oif = dev_out->ifindex;
2437                         goto make_route;
2438                 }
                     /* Source address accepted; the device found for it is
                        not used any further, so drop the reference again. */
2439                 if (dev_out)
2440                         dev_put(dev_out);
2441                 dev_out = NULL;
2442         }
2443
2444
             /* Caller pinned the output interface. */
2445         if (oldflp->oif) {
2446                 dev_out = dev_get_by_index(oldflp->oif);
2447                 err = -ENODEV;
2448                 if (dev_out == NULL)
2449                         goto out;
2450
2451                 /* RACE: Check return value of inet_select_addr instead. */
2452                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2453                         dev_put(dev_out);
2454                         goto out;       /* Wrong error code */
2455                 }
2456
2457                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2458                         if (!fl.fl4_src)
2459                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2460                                                               RT_SCOPE_LINK);
2461                         goto make_route;
2462                 }
2463                 if (!fl.fl4_src) {
2464                         if (MULTICAST(oldflp->fl4_dst))
2465                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2466                                                               fl.fl4_scope);
2467                         else if (!oldflp->fl4_dst)
2468                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2469                                                               RT_SCOPE_HOST);
2470                 }
2471         }
2472
             /* No destination given: route the flow to this host over the
                loopback device, using the source (or INADDR_LOOPBACK) as
                the destination. */
2473         if (!fl.fl4_dst) {
2474                 fl.fl4_dst = fl.fl4_src;
2475                 if (!fl.fl4_dst)
2476                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2477                 if (dev_out)
2478                         dev_put(dev_out);
2479                 dev_out = &loopback_dev;
2480                 dev_hold(dev_out);
2481                 fl.oif = loopback_dev.ifindex;
2482                 res.type = RTN_LOCAL;
2483                 flags |= RTCF_LOCAL;
2484                 goto make_route;
2485         }
2486
2487         if (fib_lookup(&fl, &res)) {
2488                 res.fi = NULL;
2489                 if (oldflp->oif) {
2490                         /* Apparently, routing tables are wrong. Assume,
2491                            that the destination is on link.
2492
2493                            WHY? DW.
2494                            Because we are allowed to send to iface
2495                            even if it has NO routes and NO assigned
2496                            addresses. When oif is specified, routing
2497                            tables are looked up with only one purpose:
2498                            to catch if destination is gatewayed, rather than
2499                            direct. Moreover, if MSG_DONTROUTE is set,
2500                            we send packet, ignoring both routing tables
2501                            and ifaddr state. --ANK
2502
2503
2504                            We could make it even if oif is unknown,
2505                            likely IPv6, but we do not.
2506                          */
2507
2508                         if (fl.fl4_src == 0)
2509                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2510                                                               RT_SCOPE_LINK);
2511                         res.type = RTN_UNICAST;
2512                         goto make_route;
2513                 }
2514                 if (dev_out)
2515                         dev_put(dev_out);
2516                 err = -ENETUNREACH;
2517                 goto out;
2518         }
             /* fib_lookup() succeeded: from here on res is released with
                fib_res_put() after make_route. */
2519         free_res = 1;
2520
             /* Destination is one of our own addresses: use loopback. */
2521         if (res.type == RTN_LOCAL) {
2522                 if (!fl.fl4_src)
2523                         fl.fl4_src = fl.fl4_dst;
2524                 if (dev_out)
2525                         dev_put(dev_out);
2526                 dev_out = &loopback_dev;
2527                 dev_hold(dev_out);
2528                 fl.oif = dev_out->ifindex;
2529                 if (res.fi)
2530                         fib_info_put(res.fi);
2531                 res.fi = NULL;
2532                 flags |= RTCF_LOCAL;
2533                 goto make_route;
2534         }
2535
2536 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2537         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2538                 fib_select_multipath(&fl, &res);
2539         else
2540 #endif
2541         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2542                 fib_select_default(&fl, &res);
2543
2544         if (!fl.fl4_src)
2545                 fl.fl4_src = FIB_RES_PREFSRC(res);
2546
             /* Switch to the device named by the FIB result, replacing any
                device reference taken earlier. */
2547         if (dev_out)
2548                 dev_put(dev_out);
2549         dev_out = FIB_RES_DEV(res);
2550         dev_hold(dev_out);
2551         fl.oif = dev_out->ifindex;
2552
2553
             /* Every path reaching this label holds a reference on dev_out;
                it is dropped again once ip_mkroute_output() has returned. */
2554 make_route:
2555         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2556
2557
2558         if (free_res)
2559                 fib_res_put(&res);
2560         if (dev_out)
2561                 dev_put(dev_out);
2562 out:    return err;
2563 }
2564
2565 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2566 {
2567         unsigned hash;
2568         struct rtable *rth;
2569
2570         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2571
2572         rcu_read_lock_bh();
2573         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2574                 rth = rcu_dereference(rth->u.rt_next)) {
2575                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2576                     rth->fl.fl4_src == flp->fl4_src &&
2577                     rth->fl.iif == 0 &&
2578                     rth->fl.oif == flp->oif &&
2579 #ifdef CONFIG_IP_ROUTE_FWMARK
2580                     rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2581 #endif
2582                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2583                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2584
2585                         /* check for multipath routes and choose one if
2586                          * necessary
2587                          */
2588                         if (multipath_select_route(flp, rth, rp)) {
2589                                 dst_hold(&(*rp)->u.dst);
2590                                 RT_CACHE_STAT_INC(out_hit);
2591                                 rcu_read_unlock_bh();
2592                                 return 0;
2593                         }
2594
2595                         rth->u.dst.lastuse = jiffies;
2596                         dst_hold(&rth->u.dst);
2597                         rth->u.dst.__use++;
2598                         RT_CACHE_STAT_INC(out_hit);
2599                         rcu_read_unlock_bh();
2600                         *rp = rth;
2601                         return 0;
2602                 }
2603                 RT_CACHE_STAT_INC(out_hlist_search);
2604         }
2605         rcu_read_unlock_bh();
2606
2607         return ip_route_output_slow(rp, flp);
2608 }
2609
2610 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2611
2612 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2613 {
2614         int err;
2615
2616         if ((err = __ip_route_output_key(rp, flp)) != 0)
2617                 return err;
2618
2619         if (flp->proto) {
2620                 if (!flp->fl4_src)
2621                         flp->fl4_src = (*rp)->rt_src;
2622                 if (!flp->fl4_dst)
2623                         flp->fl4_dst = (*rp)->rt_dst;
2624                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2625         }
2626
2627         return 0;
2628 }
2629
2630 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2631
2632 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2633 {
2634         return ip_route_output_flow(rp, flp, NULL, 0);
2635 }
2636
/*
 * Append a netlink route message of type @event to @skb, describing the
 * cached route that skb->dst points at.
 *
 * @pid, @seq and @flags are passed to nlmsg_put() for the message header.
 * @nowait is forwarded to ipmr_get_route() for multicast input routes.
 *
 * Returns the value of nlmsg_end() when the message was completed,
 * -ENOBUFS when the header could not be added, and the value of
 * nlmsg_cancel() when an attribute could not be added.  In the multicast
 * case it returns 0 when ipmr_get_route() returned 0 and @nowait is clear.
 *
 * NOTE(review): the NLA_PUT*() macros are expected to jump to
 * nla_put_failure when the attribute does not fit -- confirm against
 * include/net/netlink.h.
 */
2637 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2638                         int nowait, unsigned int flags)
2639 {
2640         struct rtable *rt = (struct rtable*)skb->dst;
2641         struct rtmsg *r;
2642         struct nlmsghdr *nlh;
2643         struct rta_cacheinfo ci;
2644
2645         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2646         if (nlh == NULL)
2647                 return -ENOBUFS;
2648
             /* Fixed rtmsg header: a /32 destination, always reported as
                belonging to the main table. */
2649         r = nlmsg_data(nlh);
2650         r->rtm_family    = AF_INET;
2651         r->rtm_dst_len  = 32;
2652         r->rtm_src_len  = 0;
2653         r->rtm_tos      = rt->fl.fl4_tos;
2654         r->rtm_table    = RT_TABLE_MAIN;
2655         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2656         r->rtm_type     = rt->rt_type;
2657         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2658         r->rtm_protocol = RTPROT_UNSPEC;
             /* The low 16 bits of rt_flags are masked out; RTM_F_CLONED is
                always set. */
2659         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2660         if (rt->rt_flags & RTCF_NOTIFY)
2661                 r->rtm_flags |= RTM_F_NOTIFY;
2662
2663         NLA_PUT_U32(skb, RTA_DST, rt->rt_dst);
2664
2665         if (rt->fl.fl4_src) {
2666                 r->rtm_src_len = 32;
2667                 NLA_PUT_U32(skb, RTA_SRC, rt->fl.fl4_src);
2668         }
2669         if (rt->u.dst.dev)
2670                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2671 #ifdef CONFIG_NET_CLS_ROUTE
2672         if (rt->u.dst.tclassid)
2673                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2674 #endif
2675 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2676         if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2677                 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2678 #endif
             /* Routes with an input interface report rt_spec_dst; others
                report rt_src, but only when it differs from the source in
                the flow key. */
2679         if (rt->fl.iif)
2680                 NLA_PUT_U32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2681         else if (rt->rt_src != rt->fl.fl4_src)
2682                 NLA_PUT_U32(skb, RTA_PREFSRC, rt->rt_src);
2683
2684         if (rt->rt_dst != rt->rt_gateway)
2685                 NLA_PUT_U32(skb, RTA_GATEWAY, rt->rt_gateway);
2686
2687         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2688                 goto nla_put_failure;
2689
             /* Cache statistics; they are emitted as RTA_CACHEINFO at the
                end, after the multicast code below had a chance to
                overwrite rta_error. */
2690         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2691         ci.rta_used     = rt->u.dst.__use;
2692         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2693         if (rt->u.dst.expires)
2694                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2695         else
2696                 ci.rta_expires = 0;
2697         ci.rta_error    = rt->u.dst.error;
2698         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2699         if (rt->peer) {
2700                 ci.rta_id = rt->peer->ip_id_count;
2701                 if (rt->peer->tcp_ts_stamp) {
2702                         ci.rta_ts = rt->peer->tcp_ts;
2703                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2704                 }
2705         }
2706
2707         if (rt->fl.iif) {
2708 #ifdef CONFIG_IP_MROUTE
2709                 u32 dst = rt->rt_dst;
2710
                     /* Forwarded (non link-local) multicast: let
                        ipmr_get_route() describe the route instead of
                        emitting RTA_IIF. */
2711                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2712                     ipv4_devconf.mc_forwarding) {
2713                         int err = ipmr_get_route(skb, r, nowait);
2714                         if (err <= 0) {
2715                                 if (!nowait) {
2716                                         if (err == 0)
2717                                                 return 0;
2718                                         goto nla_put_failure;
2719                                 } else {
2720                                         if (err == -EMSGSIZE)
2721                                                 goto nla_put_failure;
                                             /* Any other result is reported
                                                to the reader through the
                                                cacheinfo error field. */
2722                                         ci.rta_error = err;
2723                                 }
2724                         }
2725                 } else
2726 #endif
2727                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2728         }
2729
2730         NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2731
2732         return nlmsg_end(skb, nlh);
2733
2734 nla_put_failure:
2735         return nlmsg_cancel(skb, nlh);
2736 }
2737
2738 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2739 {
2740         struct rtmsg *rtm;
2741         struct nlattr *tb[RTA_MAX+1];
2742         struct rtable *rt = NULL;
2743         __be32 dst = 0;
2744         __be32 src = 0;
2745         u32 iif;
2746         int err;
2747         struct sk_buff *skb;
2748
2749         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2750         if (err < 0)
2751                 goto errout;
2752
2753         rtm = nlmsg_data(nlh);
2754
2755         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2756         if (skb == NULL) {
2757                 err = -ENOBUFS;
2758                 goto errout;
2759         }
2760
2761         /* Reserve room for dummy headers, this skb can pass
2762            through good chunk of routing engine.
2763          */
2764         skb->mac.raw = skb->nh.raw = skb->data;
2765
2766         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2767         skb->nh.iph->protocol = IPPROTO_ICMP;
2768         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2769
2770         src = tb[RTA_SRC] ? nla_get_u32(tb[RTA_SRC]) : 0;
2771         dst = tb[RTA_DST] ? nla_get_u32(tb[RTA_DST]) : 0;
2772         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2773
2774         if (iif) {
2775                 struct net_device *dev;
2776
2777                 dev = __dev_get_by_index(iif);
2778                 if (dev == NULL) {
2779                         err = -ENODEV;
2780                         goto errout_free;
2781                 }
2782
2783                 skb->protocol   = htons(ETH_P_IP);
2784                 skb->dev        = dev;
2785                 local_bh_disable();
2786                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2787                 local_bh_enable();
2788
2789                 rt = (struct rtable*) skb->dst;
2790                 if (err == 0 && rt->u.dst.error)
2791                         err = -rt->u.dst.error;
2792         } else {
2793                 struct flowi fl = {
2794                         .nl_u = {
2795                                 .ip4_u = {
2796                                         .daddr = dst,
2797                                         .saddr = src,
2798                                         .tos = rtm->rtm_tos,
2799                                 },
2800                         },
2801                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2802                 };
2803                 err = ip_route_output_key(&rt, &fl);
2804         }
2805
2806         if (err)
2807                 goto errout_free;
2808
2809         skb->dst = &rt->u.dst;
2810         if (rtm->rtm_flags & RTM_F_NOTIFY)
2811                 rt->rt_flags |= RTCF_NOTIFY;
2812
2813         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2814                                 RTM_NEWROUTE, 0, 0);
2815         if (err <= 0)
2816                 goto errout_free;
2817
2818         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2819 errout:
2820         return err;
2821
2822 errout_free:
2823         kfree_skb(skb);
2824         goto errout;
2825 }
2826
2827 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2828 {
2829         struct rtable *rt;
2830         int h, s_h;
2831         int idx, s_idx;
2832
2833         s_h = cb->args[0];
2834         s_idx = idx = cb->args[1];
2835         for (h = 0; h <= rt_hash_mask; h++) {
2836                 if (h < s_h) continue;
2837                 if (h > s_h)
2838                         s_idx = 0;
2839                 rcu_read_lock_bh();
2840                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2841                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2842                         if (idx < s_idx)
2843                                 continue;
2844                         skb->dst = dst_clone(&rt->u.dst);
2845                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2846                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2847                                          1, NLM_F_MULTI) <= 0) {
2848                                 dst_release(xchg(&skb->dst, NULL));
2849                                 rcu_read_unlock_bh();
2850                                 goto done;
2851                         }
2852                         dst_release(xchg(&skb->dst, NULL));
2853                 }
2854                 rcu_read_unlock_bh();
2855         }
2856
2857 done:
2858         cb->args[0] = h;
2859         cb->args[1] = idx;
2860         return skb->len;
2861 }
2862
/*
 * Multicast event hook: flushes the routing cache with a delay argument
 * of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        const int delay = 0;

        rt_cache_flush(delay);
}
2867
2868 #ifdef CONFIG_SYSCTL
2869 static int flush_delay;
2870
2871 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2872                                         struct file *filp, void __user *buffer,
2873                                         size_t *lenp, loff_t *ppos)
2874 {
2875         if (write) {
2876                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2877                 rt_cache_flush(flush_delay);
2878                 return 0;
2879         }
2880
2881         return -EINVAL;
2882 }
2883
2884 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2885                                                 int __user *name,
2886                                                 int nlen,
2887                                                 void __user *oldval,
2888                                                 size_t __user *oldlenp,
2889                                                 void __user *newval,
2890                                                 size_t newlen,
2891                                                 void **context)
2892 {
2893         int delay;
2894         if (newlen != sizeof(int))
2895                 return -EINVAL;
2896         if (get_user(delay, (int __user *)newval))
2897                 return -EFAULT;
2898         rt_cache_flush(delay);
2899         return 0;
2900 }
2901
2902 ctl_table ipv4_route_table[] = {
2903         {
2904                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2905                 .procname       = "flush",
2906                 .data           = &flush_delay,
2907                 .maxlen         = sizeof(int),
2908                 .mode           = 0200,
2909                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2910                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2911         },
2912         {
2913                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2914                 .procname       = "min_delay",
2915                 .data           = &ip_rt_min_delay,
2916                 .maxlen         = sizeof(int),
2917                 .mode           = 0644,
2918                 .proc_handler   = &proc_dointvec_jiffies,
2919                 .strategy       = &sysctl_jiffies,
2920         },
2921         {
2922                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2923                 .procname       = "max_delay",
2924                 .data           = &ip_rt_max_delay,
2925                 .maxlen         = sizeof(int),
2926                 .mode           = 0644,
2927                 .proc_handler   = &proc_dointvec_jiffies,
2928                 .strategy       = &sysctl_jiffies,
2929         },
2930         {
2931                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2932                 .procname       = "gc_thresh",
2933                 .data           = &ipv4_dst_ops.gc_thresh,
2934                 .maxlen         = sizeof(int),
2935                 .mode           = 0644,
2936                 .proc_handler   = &proc_dointvec,
2937         },
2938         {
2939                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2940                 .procname       = "max_size",
2941                 .data           = &ip_rt_max_size,
2942                 .maxlen         = sizeof(int),
2943                 .mode           = 0644,
2944                 .proc_handler   = &proc_dointvec,
2945         },
2946         {
2947                 /*  Deprecated. Use gc_min_interval_ms */
2948
2949                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2950                 .procname       = "gc_min_interval",
2951                 .data           = &ip_rt_gc_min_interval,
2952                 .maxlen         = sizeof(int),
2953                 .mode           = 0644,
2954                 .proc_handler   = &proc_dointvec_jiffies,
2955                 .strategy       = &sysctl_jiffies,
2956         },
2957         {
2958                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2959                 .procname       = "gc_min_interval_ms",
2960                 .data           = &ip_rt_gc_min_interval,
2961                 .maxlen         = sizeof(int),
2962                 .mode           = 0644,
2963                 .proc_handler   = &proc_dointvec_ms_jiffies,
2964                 .strategy       = &sysctl_ms_jiffies,
2965         },
2966         {
2967                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2968                 .procname       = "gc_timeout",
2969                 .data           = &ip_rt_gc_timeout,
2970                 .maxlen         = sizeof(int),
2971                 .mode           = 0644,
2972                 .proc_handler   = &proc_dointvec_jiffies,
2973                 .strategy       = &sysctl_jiffies,
2974         },
2975         {
2976                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2977                 .procname       = "gc_interval",
2978                 .data           = &ip_rt_gc_interval,
2979                 .maxlen         = sizeof(int),
2980                 .mode           = 0644,
2981                 .proc_handler   = &proc_dointvec_jiffies,
2982                 .strategy       = &sysctl_jiffies,
2983         },
2984         {
2985                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2986                 .procname       = "redirect_load",
2987                 .data           = &ip_rt_redirect_load,
2988                 .maxlen         = sizeof(int),
2989                 .mode           = 0644,
2990                 .proc_handler   = &proc_dointvec,
2991         },
2992         {
2993                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2994                 .procname       = "redirect_number",
2995                 .data           = &ip_rt_redirect_number,
2996                 .maxlen         = sizeof(int),
2997                 .mode           = 0644,
2998                 .proc_handler   = &proc_dointvec,
2999         },
3000         {
3001                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3002                 .procname       = "redirect_silence",
3003                 .data           = &ip_rt_redirect_silence,
3004                 .maxlen         = sizeof(int),
3005                 .mode           = 0644,
3006                 .proc_handler   = &proc_dointvec,
3007         },
3008         {
3009                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
3010                 .procname       = "error_cost",
3011                 .data           = &ip_rt_error_cost,
3012                 .maxlen         = sizeof(int),
3013                 .mode           = 0644,
3014                 .proc_handler   = &proc_dointvec,
3015         },
3016         {
3017                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3018                 .procname       = "error_burst",
3019                 .data           = &ip_rt_error_burst,
3020                 .maxlen         = sizeof(int),
3021                 .mode           = 0644,
3022                 .proc_handler   = &proc_dointvec,
3023         },
3024         {
3025                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3026                 .procname       = "gc_elasticity",
3027                 .data           = &ip_rt_gc_elasticity,
3028                 .maxlen         = sizeof(int),
3029                 .mode           = 0644,
3030                 .proc_handler   = &proc_dointvec,
3031         },
3032         {
3033                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3034                 .procname       = "mtu_expires",
3035                 .data           = &ip_rt_mtu_expires,
3036                 .maxlen         = sizeof(int),
3037                 .mode           = 0644,
3038                 .proc_handler   = &proc_dointvec_jiffies,
3039                 .strategy       = &sysctl_jiffies,
3040         },
3041         {
3042                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3043                 .procname       = "min_pmtu",
3044                 .data           = &ip_rt_min_pmtu,
3045                 .maxlen         = sizeof(int),
3046                 .mode           = 0644,
3047                 .proc_handler   = &proc_dointvec,
3048         },
3049         {
3050                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3051                 .procname       = "min_adv_mss",
3052                 .data           = &ip_rt_min_advmss,
3053                 .maxlen         = sizeof(int),
3054                 .mode           = 0644,
3055                 .proc_handler   = &proc_dointvec,
3056         },
3057         {
3058                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3059                 .procname       = "secret_interval",
3060                 .data           = &ip_rt_secret_interval,
3061                 .maxlen         = sizeof(int),
3062                 .mode           = 0644,
3063                 .proc_handler   = &proc_dointvec_jiffies,
3064                 .strategy       = &sysctl_jiffies,
3065         },
3066         { .ctl_name = 0 }
3067 };
3068 #endif
3069
3070 #ifdef CONFIG_NET_CLS_ROUTE
3071 struct ip_rt_acct *ip_rt_acct;
3072
3073 /* This code sucks.  But you should have seen it before! --RR */
3074
/*
 * IP route accounting ptr for this logical cpu number: each cpu owns a
 * consecutive run of 256 struct ip_rt_acct slots inside ip_rt_acct.
 * The argument is parenthesized so that expressions such as
 * IP_RT_ACCT_CPU(a + b) index by (a + b) * 256.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3077
3078 #ifdef CONFIG_PROC_FS
3079 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3080                            int length, int *eof, void *data)
3081 {
3082         unsigned int i;
3083
3084         if ((offset & 3) || (length & 3))
3085                 return -EIO;
3086
3087         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3088                 *eof = 1;
3089                 return 0;
3090         }
3091
3092         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3093                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3094                 *eof = 1;
3095         }
3096
3097         offset /= sizeof(u32);
3098
3099         if (length > 0) {
3100                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3101                 u32 *dst = (u32 *) buffer;
3102
3103                 /* Copy first cpu. */
3104                 *start = buffer;
3105                 memcpy(dst, src, length);
3106
3107                 /* Add the other cpus in, one int at a time */
3108                 for_each_possible_cpu(i) {
3109                         unsigned int j;
3110
3111                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3112
3113                         for (j = 0; j < length/4; j++)
3114                                 dst[j] += src[j];
3115                 }
3116         }
3117         return length;
3118 }
3119 #endif /* CONFIG_PROC_FS */
3120 #endif /* CONFIG_NET_CLS_ROUTE */
3121
3122 static __initdata unsigned long rhash_entries;
3123 static int __init set_rhash_entries(char *str)
3124 {
3125         if (!str)
3126                 return 0;
3127         rhash_entries = simple_strtoul(str, &str, 0);
3128         return 1;
3129 }
3130 __setup("rhash_entries=", set_rhash_entries);
3131
/*
 * Boot-time initialisation of the IPv4 routing cache.
 *
 * Seeds rt_hash_rnd, allocates the route accounting area (when
 * CONFIG_NET_CLS_ROUTE is set), creates the "ip_dst_cache" slab cache and
 * the route cache hash table, derives gc_thresh and ip_rt_max_size from
 * the table size, initialises devinet and the FIB, sets up and arms the
 * periodic and secret timers, registers the /proc entries and finally
 * initialises xfrm (when CONFIG_XFRM is set).
 *
 * Returns 0 on success, or -ENOMEM when a /proc entry cannot be created.
 * NOTE(review): that early return happens after the timers were armed and
 * skips xfrm_init()/xfrm4_init() -- confirm this is intended.
 */
3132 int __init ip_rt_init(void)
3133 {
3134         int rc = 0;
3135
3136         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3137                              (jiffies ^ (jiffies >> 7)));
3138
3139 #ifdef CONFIG_NET_CLS_ROUTE
3140         {
3141         int order;
             /* Smallest page order that holds 256 accounting records for
                each of NR_CPUS cpus. */
3142         for (order = 0;
3143              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3144                 /* NOTHING */;
3145         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3146         if (!ip_rt_acct)
3147                 panic("IP: failed to allocate ip_rt_acct\n");
3148         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3149         }
3150 #endif
3151
3152         ipv4_dst_ops.kmem_cachep =
3153                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3154                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3155
             /* rhash_entries comes from the "rhash_entries=" boot option;
                the scale argument depends on the amount of physical
                memory. */
3156         rt_hash_table = (struct rt_hash_bucket *)
3157                 alloc_large_system_hash("IP route cache",
3158                                         sizeof(struct rt_hash_bucket),
3159                                         rhash_entries,
3160                                         (num_physpages >= 128 * 1024) ?
3161                                         15 : 17,
3162                                         0,
3163                                         &rt_hash_log,
3164                                         &rt_hash_mask,
3165                                         0);
3166         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3167         rt_hash_lock_init();
3168
             /* Limits scale with the table: gc_thresh is the number of
                buckets, ip_rt_max_size sixteen times that. */
3169         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3170         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3171
3172         devinet_init();
3173         ip_fib_init();
3174
             /* The flush timer is only initialised here; the periodic and
                secret timers are armed below. */
3175         init_timer(&rt_flush_timer);
3176         rt_flush_timer.function = rt_run_flush;
3177         init_timer(&rt_periodic_timer);
3178         rt_periodic_timer.function = rt_check_expire;
3179         init_timer(&rt_secret_timer);
3180         rt_secret_timer.function = rt_secret_rebuild;
3181
3182         /* All the timers, started at system startup tend
3183            to synchronize. Perturb it a bit.
3184          */
3185         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3186                                         ip_rt_gc_interval;
3187         add_timer(&rt_periodic_timer);
3188
3189         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3190                 ip_rt_secret_interval;
3191         add_timer(&rt_secret_timer);
3192
3193 #ifdef CONFIG_PROC_FS
3194         {
3195         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
             /* Two entries named "rt_cache": one created with
                proc_net_fops_create(), one under proc_net_stat. */
3196         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3197             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3198                                              proc_net_stat))) {
3199                 return -ENOMEM;
3200         }
3201         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3202         }
3203 #ifdef CONFIG_NET_CLS_ROUTE
3204         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3205 #endif
3206 #endif
3207 #ifdef CONFIG_XFRM
3208         xfrm_init();
3209         xfrm4_init();
3210 #endif
3211         return rc;
3212 }
3213
3214 EXPORT_SYMBOL(__ip_select_ident);
3215 EXPORT_SYMBOL(ip_route_input);
3216 EXPORT_SYMBOL(ip_route_output_key);