1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD,
35  *                                      though our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
144 static void              ipv4_dst_destroy(struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149
150 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
151                             int how)
152 {
153 }
154
155 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156 {
157         struct rtable *rt = (struct rtable *) dst;
158         struct inet_peer *peer;
159         u32 *p = NULL;
160
161         if (!rt->peer)
162                 rt_bind_peer(rt, 1);
163
164         peer = rt->peer;
165         if (peer) {
166                 u32 *old_p = __DST_METRICS_PTR(old);
167                 unsigned long prev, new;
168
169                 p = peer->metrics;
170                 if (inet_metrics_new(peer))
171                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
172
173                 new = (unsigned long) p;
174                 prev = cmpxchg(&dst->_metrics, old, new);
175
176                 if (prev != old) {
177                         p = __DST_METRICS_PTR(prev);
178                         if (prev & DST_METRICS_READ_ONLY)
179                                 p = NULL;
180                 } else {
181                         if (rt->fi) {
182                                 fib_info_put(rt->fi);
183                                 rt->fi = NULL;
184                         }
185                 }
186         }
187         return p;
188 }
189
190 static struct dst_ops ipv4_dst_ops = {
191         .family =               AF_INET,
192         .protocol =             cpu_to_be16(ETH_P_IP),
193         .gc =                   rt_garbage_collect,
194         .check =                ipv4_dst_check,
195         .default_advmss =       ipv4_default_advmss,
196         .default_mtu =          ipv4_default_mtu,
197         .cow_metrics =          ipv4_cow_metrics,
198         .destroy =              ipv4_dst_destroy,
199         .ifdown =               ipv4_dst_ifdown,
200         .negative_advice =      ipv4_negative_advice,
201         .link_failure =         ipv4_link_failure,
202         .update_pmtu =          ip_rt_update_pmtu,
203         .local_out =            __ip_local_out,
204 };
205
206 #define ECN_OR_COST(class)      TC_PRIO_##class
207
208 const __u8 ip_tos2prio[16] = {
209         TC_PRIO_BESTEFFORT,
210         ECN_OR_COST(FILLER),
211         TC_PRIO_BESTEFFORT,
212         ECN_OR_COST(BESTEFFORT),
213         TC_PRIO_BULK,
214         ECN_OR_COST(BULK),
215         TC_PRIO_BULK,
216         ECN_OR_COST(BULK),
217         TC_PRIO_INTERACTIVE,
218         ECN_OR_COST(INTERACTIVE),
219         TC_PRIO_INTERACTIVE,
220         ECN_OR_COST(INTERACTIVE),
221         TC_PRIO_INTERACTIVE_BULK,
222         ECN_OR_COST(INTERACTIVE_BULK),
223         TC_PRIO_INTERACTIVE_BULK,
224         ECN_OR_COST(INTERACTIVE_BULK)
225 };
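/*
 * Illustrative sketch (not part of this file) of how the table above is
 * typically consumed: callers such as rt_tos2priority() in
 * include/net/route.h index it with the TOS field of the IP header,
 * roughly
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * For example, tos = 0x10 (IPTOS_LOWDELAY) gives index
 * (0x10 & 0x1e) >> 1 = 8, i.e. TC_PRIO_INTERACTIVE.
 */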
226
227
228 /*
229  * Route cache.
230  */
231
232 /* The locking scheme is rather straightforward:
233  *
234  * 1) Read-Copy Update protects the buckets of the central route hash.
235  * 2) Only writers remove entries, and they hold the lock
236  *    as they look at rtable reference counts.
237  * 3) Only readers acquire references to rtable entries,
238  *    they do so with atomic increments and with the
239  *    lock held.
240  */
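/*
 * Illustrative sketch (not compiled here) of the read side implied by the
 * scheme above, mirroring the lookup loops later in this file; "flp" is a
 * placeholder for the caller's flow key.  Readers walk a bucket chain under
 * rcu_read_lock_bh() and take a reference only on the entry they return,
 * while writers unlink entries under the bucket spinlock and defer the
 * actual free via call_rcu_bh() (see rt_free() below).
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (compare_keys(&rth->fl, flp) && !rt_is_expired(rth)) {
 *			dst_use(&rth->dst, jiffies);
 *			rcu_read_unlock_bh();
 *			return rth;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 */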
241
242 struct rt_hash_bucket {
243         struct rtable __rcu     *chain;
244 };
245
246 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
247         defined(CONFIG_PROVE_LOCKING)
248 /*
249  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
250  * The size of this table is a power of two and depends on the number of CPUs.
251  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
252  */
253 #ifdef CONFIG_LOCKDEP
254 # define RT_HASH_LOCK_SZ        256
255 #else
256 # if NR_CPUS >= 32
257 #  define RT_HASH_LOCK_SZ       4096
258 # elif NR_CPUS >= 16
259 #  define RT_HASH_LOCK_SZ       2048
260 # elif NR_CPUS >= 8
261 #  define RT_HASH_LOCK_SZ       1024
262 # elif NR_CPUS >= 4
263 #  define RT_HASH_LOCK_SZ       512
264 # else
265 #  define RT_HASH_LOCK_SZ       256
266 # endif
267 #endif
268
269 static spinlock_t       *rt_hash_locks;
270 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
271
272 static __init void rt_hash_lock_init(void)
273 {
274         int i;
275
276         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
277                         GFP_KERNEL);
278         if (!rt_hash_locks)
279                 panic("IP: failed to allocate rt_hash_locks\n");
280
281         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
282                 spin_lock_init(&rt_hash_locks[i]);
283 }
284 #else
285 # define rt_hash_lock_addr(slot) NULL
286
287 static inline void rt_hash_lock_init(void)
288 {
289 }
290 #endif
291
292 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
293 static unsigned                 rt_hash_mask __read_mostly;
294 static unsigned int             rt_hash_log  __read_mostly;
295
296 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
297 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
298
299 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
300                                    int genid)
301 {
302         return jhash_3words((__force u32)daddr, (__force u32)saddr,
303                             idx, genid)
304                 & rt_hash_mask;
305 }
306
307 static inline int rt_genid(struct net *net)
308 {
309         return atomic_read(&net->ipv4.rt_genid);
310 }
311
312 #ifdef CONFIG_PROC_FS
313 struct rt_cache_iter_state {
314         struct seq_net_private p;
315         int bucket;
316         int genid;
317 };
318
319 static struct rtable *rt_cache_get_first(struct seq_file *seq)
320 {
321         struct rt_cache_iter_state *st = seq->private;
322         struct rtable *r = NULL;
323
324         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
325                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
326                         continue;
327                 rcu_read_lock_bh();
328                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
329                 while (r) {
330                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
331                             r->rt_genid == st->genid)
332                                 return r;
333                         r = rcu_dereference_bh(r->dst.rt_next);
334                 }
335                 rcu_read_unlock_bh();
336         }
337         return r;
338 }
339
340 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
341                                           struct rtable *r)
342 {
343         struct rt_cache_iter_state *st = seq->private;
344
345         r = rcu_dereference_bh(r->dst.rt_next);
346         while (!r) {
347                 rcu_read_unlock_bh();
348                 do {
349                         if (--st->bucket < 0)
350                                 return NULL;
351                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
352                 rcu_read_lock_bh();
353                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
354         }
355         return r;
356 }
357
358 static struct rtable *rt_cache_get_next(struct seq_file *seq,
359                                         struct rtable *r)
360 {
361         struct rt_cache_iter_state *st = seq->private;
362         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
363                 if (dev_net(r->dst.dev) != seq_file_net(seq))
364                         continue;
365                 if (r->rt_genid == st->genid)
366                         break;
367         }
368         return r;
369 }
370
371 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
372 {
373         struct rtable *r = rt_cache_get_first(seq);
374
375         if (r)
376                 while (pos && (r = rt_cache_get_next(seq, r)))
377                         --pos;
378         return pos ? NULL : r;
379 }
380
381 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
382 {
383         struct rt_cache_iter_state *st = seq->private;
384         if (*pos)
385                 return rt_cache_get_idx(seq, *pos - 1);
386         st->genid = rt_genid(seq_file_net(seq));
387         return SEQ_START_TOKEN;
388 }
389
390 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
391 {
392         struct rtable *r;
393
394         if (v == SEQ_START_TOKEN)
395                 r = rt_cache_get_first(seq);
396         else
397                 r = rt_cache_get_next(seq, v);
398         ++*pos;
399         return r;
400 }
401
402 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
403 {
404         if (v && v != SEQ_START_TOKEN)
405                 rcu_read_unlock_bh();
406 }
407
408 static int rt_cache_seq_show(struct seq_file *seq, void *v)
409 {
410         if (v == SEQ_START_TOKEN)
411                 seq_printf(seq, "%-127s\n",
412                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
413                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
414                            "HHUptod\tSpecDst");
415         else {
416                 struct rtable *r = v;
417                 int len;
418
419                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
420                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
421                         r->dst.dev ? r->dst.dev->name : "*",
422                         (__force u32)r->rt_dst,
423                         (__force u32)r->rt_gateway,
424                         r->rt_flags, atomic_read(&r->dst.__refcnt),
425                         r->dst.__use, 0, (__force u32)r->rt_src,
426                         dst_metric_advmss(&r->dst) + 40,
427                         dst_metric(&r->dst, RTAX_WINDOW),
428                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
429                               dst_metric(&r->dst, RTAX_RTTVAR)),
430                         r->fl.fl4_tos,
431                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
432                         r->dst.hh ? (r->dst.hh->hh_output ==
433                                        dev_queue_xmit) : 0,
434                         r->rt_spec_dst, &len);
435
436                 seq_printf(seq, "%*s\n", 127 - len, "");
437         }
438         return 0;
439 }
440
441 static const struct seq_operations rt_cache_seq_ops = {
442         .start  = rt_cache_seq_start,
443         .next   = rt_cache_seq_next,
444         .stop   = rt_cache_seq_stop,
445         .show   = rt_cache_seq_show,
446 };
447
448 static int rt_cache_seq_open(struct inode *inode, struct file *file)
449 {
450         return seq_open_net(inode, file, &rt_cache_seq_ops,
451                         sizeof(struct rt_cache_iter_state));
452 }
453
454 static const struct file_operations rt_cache_seq_fops = {
455         .owner   = THIS_MODULE,
456         .open    = rt_cache_seq_open,
457         .read    = seq_read,
458         .llseek  = seq_lseek,
459         .release = seq_release_net,
460 };
461
462
463 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
464 {
465         int cpu;
466
467         if (*pos == 0)
468                 return SEQ_START_TOKEN;
469
470         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
471                 if (!cpu_possible(cpu))
472                         continue;
473                 *pos = cpu+1;
474                 return &per_cpu(rt_cache_stat, cpu);
475         }
476         return NULL;
477 }
478
479 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
480 {
481         int cpu;
482
483         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
484                 if (!cpu_possible(cpu))
485                         continue;
486                 *pos = cpu+1;
487                 return &per_cpu(rt_cache_stat, cpu);
488         }
489         return NULL;
490
491 }
492
493 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
494 {
495
496 }
497
498 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
499 {
500         struct rt_cache_stat *st = v;
501
502         if (v == SEQ_START_TOKEN) {
503                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
504                 return 0;
505         }
506
507         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
508                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
509                    dst_entries_get_slow(&ipv4_dst_ops),
510                    st->in_hit,
511                    st->in_slow_tot,
512                    st->in_slow_mc,
513                    st->in_no_route,
514                    st->in_brd,
515                    st->in_martian_dst,
516                    st->in_martian_src,
517
518                    st->out_hit,
519                    st->out_slow_tot,
520                    st->out_slow_mc,
521
522                    st->gc_total,
523                    st->gc_ignored,
524                    st->gc_goal_miss,
525                    st->gc_dst_overflow,
526                    st->in_hlist_search,
527                    st->out_hlist_search
528                 );
529         return 0;
530 }
531
532 static const struct seq_operations rt_cpu_seq_ops = {
533         .start  = rt_cpu_seq_start,
534         .next   = rt_cpu_seq_next,
535         .stop   = rt_cpu_seq_stop,
536         .show   = rt_cpu_seq_show,
537 };
538
539
540 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
541 {
542         return seq_open(file, &rt_cpu_seq_ops);
543 }
544
545 static const struct file_operations rt_cpu_seq_fops = {
546         .owner   = THIS_MODULE,
547         .open    = rt_cpu_seq_open,
548         .read    = seq_read,
549         .llseek  = seq_lseek,
550         .release = seq_release,
551 };
552
553 #ifdef CONFIG_IP_ROUTE_CLASSID
554 static int rt_acct_proc_show(struct seq_file *m, void *v)
555 {
556         struct ip_rt_acct *dst, *src;
557         unsigned int i, j;
558
559         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
560         if (!dst)
561                 return -ENOMEM;
562
563         for_each_possible_cpu(i) {
564                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
565                 for (j = 0; j < 256; j++) {
566                         dst[j].o_bytes   += src[j].o_bytes;
567                         dst[j].o_packets += src[j].o_packets;
568                         dst[j].i_bytes   += src[j].i_bytes;
569                         dst[j].i_packets += src[j].i_packets;
570                 }
571         }
572
573         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
574         kfree(dst);
575         return 0;
576 }
577
578 static int rt_acct_proc_open(struct inode *inode, struct file *file)
579 {
580         return single_open(file, rt_acct_proc_show, NULL);
581 }
582
583 static const struct file_operations rt_acct_proc_fops = {
584         .owner          = THIS_MODULE,
585         .open           = rt_acct_proc_open,
586         .read           = seq_read,
587         .llseek         = seq_lseek,
588         .release        = single_release,
589 };
590 #endif
591
592 static int __net_init ip_rt_do_proc_init(struct net *net)
593 {
594         struct proc_dir_entry *pde;
595
596         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
597                         &rt_cache_seq_fops);
598         if (!pde)
599                 goto err1;
600
601         pde = proc_create("rt_cache", S_IRUGO,
602                           net->proc_net_stat, &rt_cpu_seq_fops);
603         if (!pde)
604                 goto err2;
605
606 #ifdef CONFIG_IP_ROUTE_CLASSID
607         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
608         if (!pde)
609                 goto err3;
610 #endif
611         return 0;
612
613 #ifdef CONFIG_IP_ROUTE_CLASSID
614 err3:
615         remove_proc_entry("rt_cache", net->proc_net_stat);
616 #endif
617 err2:
618         remove_proc_entry("rt_cache", net->proc_net);
619 err1:
620         return -ENOMEM;
621 }
622
623 static void __net_exit ip_rt_do_proc_exit(struct net *net)
624 {
625         remove_proc_entry("rt_cache", net->proc_net_stat);
626         remove_proc_entry("rt_cache", net->proc_net);
627 #ifdef CONFIG_IP_ROUTE_CLASSID
628         remove_proc_entry("rt_acct", net->proc_net);
629 #endif
630 }
631
632 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
633         .init = ip_rt_do_proc_init,
634         .exit = ip_rt_do_proc_exit,
635 };
636
637 static int __init ip_rt_proc_init(void)
638 {
639         return register_pernet_subsys(&ip_rt_proc_ops);
640 }
641
642 #else
643 static inline int ip_rt_proc_init(void)
644 {
645         return 0;
646 }
647 #endif /* CONFIG_PROC_FS */
648
649 static inline void rt_free(struct rtable *rt)
650 {
651         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
652 }
653
654 static inline void rt_drop(struct rtable *rt)
655 {
656         ip_rt_put(rt);
657         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
658 }
659
660 static inline int rt_fast_clean(struct rtable *rth)
661 {
662         /* Kill broadcast/multicast entries very aggressively, if they
663            collide in the hash table with more useful entries */
664         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
665                 rt_is_input_route(rth) && rth->dst.rt_next;
666 }
667
668 static inline int rt_valuable(struct rtable *rth)
669 {
670         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
671                 rth->dst.expires;
672 }
673
674 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
675 {
676         unsigned long age;
677         int ret = 0;
678
679         if (atomic_read(&rth->dst.__refcnt))
680                 goto out;
681
682         ret = 1;
683         if (rth->dst.expires &&
684             time_after_eq(jiffies, rth->dst.expires))
685                 goto out;
686
687         age = jiffies - rth->dst.lastuse;
688         ret = 0;
689         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
690             (age <= tmo2 && rt_valuable(rth)))
691                 goto out;
692         ret = 1;
693 out:    return ret;
694 }
695
696 /* Bits of score are:
697  * 31: very valuable
698  * 30: not quite useless
699  * 29..0: usage counter
700  */
701 static inline u32 rt_score(struct rtable *rt)
702 {
703         u32 score = jiffies - rt->dst.lastuse;
704
705         score = ~score & ~(3<<30);
706
707         if (rt_valuable(rt))
708                 score |= (1<<31);
709
710         if (rt_is_output_route(rt) ||
711             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
712                 score |= (1<<30);
713
714         return score;
715 }
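/*
 * Worked example of the score layout above (illustrative numbers only):
 * for an output route last used 100 jiffies ago, with neither
 * RTCF_REDIRECTED/RTCF_NOTIFY set nor an expiry, rt_score() computes
 *
 *	score = ~100UL & ~(3 << 30);	(inverted age in bits 29..0)
 *	score |= 1 << 30;		(output route: "not quite useless")
 *
 * so a recently used entry keeps a larger counter value, while an old,
 * easily rebuilt broadcast/multicast input route ends up with the
 * smallest score and becomes the preferred eviction candidate in
 * rt_intern_hash(), which evicts the entry with the minimum score.
 */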
716
717 static inline bool rt_caching(const struct net *net)
718 {
719         return net->ipv4.current_rt_cache_rebuild_count <=
720                 net->ipv4.sysctl_rt_cache_rebuild_count;
721 }
722
723 static inline bool compare_hash_inputs(const struct flowi *fl1,
724                                         const struct flowi *fl2)
725 {
726         return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
727                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
728                 (fl1->iif ^ fl2->iif)) == 0);
729 }
730
731 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
732 {
733         return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
734                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
735                 (fl1->mark ^ fl2->mark) |
736                 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
737                 (fl1->oif ^ fl2->oif) |
738                 (fl1->iif ^ fl2->iif)) == 0;
739 }
740
741 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
742 {
743         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
744 }
745
746 static inline int rt_is_expired(struct rtable *rth)
747 {
748         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
749 }
750
751 /*
752  * Perform a full scan of the hash table and free all entries.
753  * Can be called by a softirq or a process.
754  * In the latter case, we want to be rescheduled if necessary.
755  */
756 static void rt_do_flush(struct net *net, int process_context)
757 {
758         unsigned int i;
759         struct rtable *rth, *next;
760
761         for (i = 0; i <= rt_hash_mask; i++) {
762                 struct rtable __rcu **pprev;
763                 struct rtable *list;
764
765                 if (process_context && need_resched())
766                         cond_resched();
767                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
768                 if (!rth)
769                         continue;
770
771                 spin_lock_bh(rt_hash_lock_addr(i));
772
773                 list = NULL;
774                 pprev = &rt_hash_table[i].chain;
775                 rth = rcu_dereference_protected(*pprev,
776                         lockdep_is_held(rt_hash_lock_addr(i)));
777
778                 while (rth) {
779                         next = rcu_dereference_protected(rth->dst.rt_next,
780                                 lockdep_is_held(rt_hash_lock_addr(i)));
781
782                         if (!net ||
783                             net_eq(dev_net(rth->dst.dev), net)) {
784                                 rcu_assign_pointer(*pprev, next);
785                                 rcu_assign_pointer(rth->dst.rt_next, list);
786                                 list = rth;
787                         } else {
788                                 pprev = &rth->dst.rt_next;
789                         }
790                         rth = next;
791                 }
792
793                 spin_unlock_bh(rt_hash_lock_addr(i));
794
795                 for (; list; list = next) {
796                         next = rcu_dereference_protected(list->dst.rt_next, 1);
797                         rt_free(list);
798                 }
799         }
800 }
801
802 /*
803  * While freeing expired entries, we compute average chain length
804  * and standard deviation, using fixed-point arithmetic.
805  * This gives an estimate of rt_chain_length_max:
806  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
807  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
808  */
809
810 #define FRACT_BITS 3
811 #define ONE (1UL << FRACT_BITS)
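/*
 * Worked example of the fixed-point statistics above (illustrative
 * numbers only, assuming the default ip_rt_gc_elasticity of 8): with
 * FRACT_BITS = 3, each distinct entry in a chain contributes ONE = 8
 * to "length" in rt_check_expire().  Suppose 128 buckets are scanned,
 * half with one entry (length 8) and half with three (length 24), so
 * sum = 2048 and sum2 = 40960.  Then
 *
 *	avg = 2048 / 128 = 16				(2.0 entries per chain)
 *	sd  = int_sqrt(40960 / 128 - 16 * 16) = 8	(1.0)
 *	rt_chain_length_max = max(8, (16 + 4 * 8) >> FRACT_BITS) = 8
 */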
812
813 /*
814  * Given a hash chain and an item in this hash chain,
815  * find out whether a previous entry has the same hash_inputs
816  * (but differs on tos, mark or oif).
817  * Returns 0 if an alias is found.
818  * Returns ONE if rth has no alias before itself.
819  */
820 static int has_noalias(const struct rtable *head, const struct rtable *rth)
821 {
822         const struct rtable *aux = head;
823
824         while (aux != rth) {
825                 if (compare_hash_inputs(&aux->fl, &rth->fl))
826                         return 0;
827                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
828         }
829         return ONE;
830 }
831
832 static void rt_check_expire(void)
833 {
834         static unsigned int rover;
835         unsigned int i = rover, goal;
836         struct rtable *rth;
837         struct rtable __rcu **rthp;
838         unsigned long samples = 0;
839         unsigned long sum = 0, sum2 = 0;
840         unsigned long delta;
841         u64 mult;
842
843         delta = jiffies - expires_ljiffies;
844         expires_ljiffies = jiffies;
845         mult = ((u64)delta) << rt_hash_log;
846         if (ip_rt_gc_timeout > 1)
847                 do_div(mult, ip_rt_gc_timeout);
848         goal = (unsigned int)mult;
849         if (goal > rt_hash_mask)
850                 goal = rt_hash_mask + 1;
851         for (; goal > 0; goal--) {
852                 unsigned long tmo = ip_rt_gc_timeout;
853                 unsigned long length;
854
855                 i = (i + 1) & rt_hash_mask;
856                 rthp = &rt_hash_table[i].chain;
857
858                 if (need_resched())
859                         cond_resched();
860
861                 samples++;
862
863                 if (rcu_dereference_raw(*rthp) == NULL)
864                         continue;
865                 length = 0;
866                 spin_lock_bh(rt_hash_lock_addr(i));
867                 while ((rth = rcu_dereference_protected(*rthp,
868                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
869                         prefetch(rth->dst.rt_next);
870                         if (rt_is_expired(rth)) {
871                                 *rthp = rth->dst.rt_next;
872                                 rt_free(rth);
873                                 continue;
874                         }
875                         if (rth->dst.expires) {
876                                 /* Entry is expired even if it is in use */
877                                 if (time_before_eq(jiffies, rth->dst.expires)) {
878 nofree:
879                                         tmo >>= 1;
880                                         rthp = &rth->dst.rt_next;
881                                         /*
882                                          * We only count entries on
883                                          * a chain with equal hash inputs once,
884                                          * so that entries for different QoS
885                                          * levels and other non-hash-input
886                                          * attributes don't unfairly skew
887                                          * the length computation.
888                                          */
889                                         length += has_noalias(rt_hash_table[i].chain, rth);
890                                         continue;
891                                 }
892                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
893                                 goto nofree;
894
895                         /* Cleanup aged off entries. */
896                         *rthp = rth->dst.rt_next;
897                         rt_free(rth);
898                 }
899                 spin_unlock_bh(rt_hash_lock_addr(i));
900                 sum += length;
901                 sum2 += length*length;
902         }
903         if (samples) {
904                 unsigned long avg = sum / samples;
905                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
906                 rt_chain_length_max = max_t(unsigned long,
907                                         ip_rt_gc_elasticity,
908                                         (avg + 4*sd) >> FRACT_BITS);
909         }
910         rover = i;
911 }
912
913 /*
914  * rt_worker_func() is run in process context.
915  * We call rt_check_expire() to scan part of the hash table.
916  */
917 static void rt_worker_func(struct work_struct *work)
918 {
919         rt_check_expire();
920         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
921 }
922
923 /*
924  * Perturbation of rt_genid by a small quantity [1..256].
925  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
926  * many times (2^24) without reusing a recent rt_genid.
927  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
928  */
929 static void rt_cache_invalidate(struct net *net)
930 {
931         unsigned char shuffle;
932
933         get_random_bytes(&shuffle, sizeof(shuffle));
934         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
935 }
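/*
 * Illustrative arithmetic for the perturbation above: "shuffle" is one
 * random byte, so every invalidation advances rt_genid by 1..256.  A
 * 32-bit genid can therefore only wrap back onto a recently used value
 * after at least 2^32 / 2^8 = 2^24 invalidations, and since entries
 * whose rt_genid no longer matches the current one are treated as
 * expired (see rt_is_expired()), stale cache entries are not mistaken
 * for fresh ones in the meantime.
 */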
936
937 /*
938  * delay < 0  : invalidate cache (fast : entries will be deleted later)
939  * delay >= 0 : invalidate & flush cache (can be long)
940  */
941 void rt_cache_flush(struct net *net, int delay)
942 {
943         rt_cache_invalidate(net);
944         if (delay >= 0)
945                 rt_do_flush(net, !in_softirq());
946 }
947
948 /* Flush previously invalidated entries from the cache */
949 void rt_cache_flush_batch(struct net *net)
950 {
951         rt_do_flush(net, !in_softirq());
952 }
953
954 static void rt_emergency_hash_rebuild(struct net *net)
955 {
956         if (net_ratelimit())
957                 printk(KERN_WARNING "Route hash chain too long!\n");
958         rt_cache_invalidate(net);
959 }
960
961 /*
962    Short description of GC goals.
963
964    We want to build an algorithm which keeps the routing cache
965    at some equilibrium point, where the number of aged-off entries
966    is kept approximately equal to the number of newly generated ones.
967
968    The current expiration strength is the variable "expire".
969    We try to adjust it dynamically, so that when networking
970    is idle, expire is large enough to keep enough warm entries,
971    and when load increases it shrinks to limit the cache size.
972  */
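/*
 * Worked example of the goal computation in rt_garbage_collect() below
 * (illustrative numbers only): with rt_hash_log = 10 (1024 buckets) and
 * the default ip_rt_gc_elasticity of 8, the "comfortable" cache size is
 * 8 << 10 = 8192 entries.  With 10000 cached entries,
 *
 *	goal = 10000 - (8 << 10) = 1808
 *
 * so roughly 1808 entries become candidates for expiry in this pass;
 * had there been fewer than 8192 entries, goal would be negative and
 * the equilibrium branch at the top of the function would run instead.
 */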
973
974 static int rt_garbage_collect(struct dst_ops *ops)
975 {
976         static unsigned long expire = RT_GC_TIMEOUT;
977         static unsigned long last_gc;
978         static int rover;
979         static int equilibrium;
980         struct rtable *rth;
981         struct rtable __rcu **rthp;
982         unsigned long now = jiffies;
983         int goal;
984         int entries = dst_entries_get_fast(&ipv4_dst_ops);
985
986         /*
987          * Garbage collection is pretty expensive,
988          * do not run it too frequently.
989          */
990
991         RT_CACHE_STAT_INC(gc_total);
992
993         if (now - last_gc < ip_rt_gc_min_interval &&
994             entries < ip_rt_max_size) {
995                 RT_CACHE_STAT_INC(gc_ignored);
996                 goto out;
997         }
998
999         entries = dst_entries_get_slow(&ipv4_dst_ops);
1000         /* Calculate the number of entries we want to expire now. */
1001         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1002         if (goal <= 0) {
1003                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1004                         equilibrium = ipv4_dst_ops.gc_thresh;
1005                 goal = entries - equilibrium;
1006                 if (goal > 0) {
1007                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1008                         goal = entries - equilibrium;
1009                 }
1010         } else {
1011                 /* We are in a dangerous area. Try to reduce the cache really
1012                  * aggressively.
1013                  */
1014                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015                 equilibrium = entries - goal;
1016         }
1017
1018         if (now - last_gc >= ip_rt_gc_min_interval)
1019                 last_gc = now;
1020
1021         if (goal <= 0) {
1022                 equilibrium += goal;
1023                 goto work_done;
1024         }
1025
1026         do {
1027                 int i, k;
1028
1029                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1030                         unsigned long tmo = expire;
1031
1032                         k = (k + 1) & rt_hash_mask;
1033                         rthp = &rt_hash_table[k].chain;
1034                         spin_lock_bh(rt_hash_lock_addr(k));
1035                         while ((rth = rcu_dereference_protected(*rthp,
1036                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1037                                 if (!rt_is_expired(rth) &&
1038                                         !rt_may_expire(rth, tmo, expire)) {
1039                                         tmo >>= 1;
1040                                         rthp = &rth->dst.rt_next;
1041                                         continue;
1042                                 }
1043                                 *rthp = rth->dst.rt_next;
1044                                 rt_free(rth);
1045                                 goal--;
1046                         }
1047                         spin_unlock_bh(rt_hash_lock_addr(k));
1048                         if (goal <= 0)
1049                                 break;
1050                 }
1051                 rover = k;
1052
1053                 if (goal <= 0)
1054                         goto work_done;
1055
1056                 /* Goal is not achieved. We stop the process if:
1057
1058                    - expire has been reduced to zero (otherwise expire is halved),
1059                    - the table is not full,
1060                    - we are called from interrupt context,
1061                    - the jiffies check is just a fallback/debug loop breaker;
1062                      we will not spin here for a long time in any case.
1063                  */
1064
1065                 RT_CACHE_STAT_INC(gc_goal_miss);
1066
1067                 if (expire == 0)
1068                         break;
1069
1070                 expire >>= 1;
1071 #if RT_CACHE_DEBUG >= 2
1072                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1073                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1074 #endif
1075
1076                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1077                         goto out;
1078         } while (!in_softirq() && time_before_eq(jiffies, now));
1079
1080         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1081                 goto out;
1082         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1083                 goto out;
1084         if (net_ratelimit())
1085                 printk(KERN_WARNING "dst cache overflow\n");
1086         RT_CACHE_STAT_INC(gc_dst_overflow);
1087         return 1;
1088
1089 work_done:
1090         expire += ip_rt_gc_min_interval;
1091         if (expire > ip_rt_gc_timeout ||
1092             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1093             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1094                 expire = ip_rt_gc_timeout;
1095 #if RT_CACHE_DEBUG >= 2
1096         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1097                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1098 #endif
1099 out:    return 0;
1100 }
1101
1102 /*
1103  * Returns the number of entries in a hash chain that have different hash_inputs.
1104  */
1105 static int slow_chain_length(const struct rtable *head)
1106 {
1107         int length = 0;
1108         const struct rtable *rth = head;
1109
1110         while (rth) {
1111                 length += has_noalias(head, rth);
1112                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1113         }
1114         return length >> FRACT_BITS;
1115 }
1116
1117 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1118                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1119 {
1120         struct rtable   *rth, *cand;
1121         struct rtable __rcu **rthp, **candp;
1122         unsigned long   now;
1123         u32             min_score;
1124         int             chain_length;
1125         int attempts = !in_softirq();
1126
1127 restart:
1128         chain_length = 0;
1129         min_score = ~(u32)0;
1130         cand = NULL;
1131         candp = NULL;
1132         now = jiffies;
1133
1134         if (!rt_caching(dev_net(rt->dst.dev))) {
1135                 /*
1136                  * If we're not caching, just tell the caller we
1137                  * were successful and don't touch the route.  The
1138                  * caller holds the sole reference to the cache entry, and
1139                  * it will be released when the caller is done with it.
1140                  * If we drop it here, the callers have no way to resolve routes
1141                  * when we're not caching.  Instead, just point *rp at rt, so
1142                  * the caller gets a single use out of the route.
1143                  * Note that we do rt_free on this new route entry, so that
1144                  * once its refcount hits zero, we are still able to reap it
1145                  * (Thanks Alexey)
1146                  * Note: To avoid expensive rcu stuff for this uncached dst,
1147                  * we set DST_NOCACHE so that dst_release() can free dst without
1148                  * waiting for a grace period.
1149                  */
1150
1151                 rt->dst.flags |= DST_NOCACHE;
1152                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1153                         int err = arp_bind_neighbour(&rt->dst);
1154                         if (err) {
1155                                 if (net_ratelimit())
1156                                         printk(KERN_WARNING
1157                                             "Neighbour table failure & not caching routes.\n");
1158                                 ip_rt_put(rt);
1159                                 return err;
1160                         }
1161                 }
1162
1163                 goto skip_hashing;
1164         }
1165
1166         rthp = &rt_hash_table[hash].chain;
1167
1168         spin_lock_bh(rt_hash_lock_addr(hash));
1169         while ((rth = rcu_dereference_protected(*rthp,
1170                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1171                 if (rt_is_expired(rth)) {
1172                         *rthp = rth->dst.rt_next;
1173                         rt_free(rth);
1174                         continue;
1175                 }
1176                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1177                         /* Put it first */
1178                         *rthp = rth->dst.rt_next;
1179                         /*
1180                          * Since lookup is lockfree, the deletion
1181                          * must be visible to another weakly ordered CPU before
1182                          * the insertion at the start of the hash chain.
1183                          */
1184                         rcu_assign_pointer(rth->dst.rt_next,
1185                                            rt_hash_table[hash].chain);
1186                         /*
1187                          * Since lookup is lockfree, the update writes
1188                          * must be ordered for consistency on SMP.
1189                          */
1190                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1191
1192                         dst_use(&rth->dst, now);
1193                         spin_unlock_bh(rt_hash_lock_addr(hash));
1194
1195                         rt_drop(rt);
1196                         if (rp)
1197                                 *rp = rth;
1198                         else
1199                                 skb_dst_set(skb, &rth->dst);
1200                         return 0;
1201                 }
1202
1203                 if (!atomic_read(&rth->dst.__refcnt)) {
1204                         u32 score = rt_score(rth);
1205
1206                         if (score <= min_score) {
1207                                 cand = rth;
1208                                 candp = rthp;
1209                                 min_score = score;
1210                         }
1211                 }
1212
1213                 chain_length++;
1214
1215                 rthp = &rth->dst.rt_next;
1216         }
1217
1218         if (cand) {
1219                 /* ip_rt_gc_elasticity used to be the average chain length;
1220                  * when exceeded, gc becomes really aggressive.
1221                  *
1222                  * The second limit is less certain. At the moment it allows
1223                  * only 2 entries per bucket. We will see.
1224                  */
1225                 if (chain_length > ip_rt_gc_elasticity) {
1226                         *candp = cand->dst.rt_next;
1227                         rt_free(cand);
1228                 }
1229         } else {
1230                 if (chain_length > rt_chain_length_max &&
1231                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1232                         struct net *net = dev_net(rt->dst.dev);
1233                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1234                         if (!rt_caching(net)) {
1235                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1236                                         rt->dst.dev->name, num);
1237                         }
1238                         rt_emergency_hash_rebuild(net);
1239                         spin_unlock_bh(rt_hash_lock_addr(hash));
1240
1241                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1242                                         ifindex, rt_genid(net));
1243                         goto restart;
1244                 }
1245         }
1246
1247         /* Try to bind the route to arp only if it is an output
1248            route or on the unicast forwarding path.
1249          */
1250         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1251                 int err = arp_bind_neighbour(&rt->dst);
1252                 if (err) {
1253                         spin_unlock_bh(rt_hash_lock_addr(hash));
1254
1255                         if (err != -ENOBUFS) {
1256                                 rt_drop(rt);
1257                                 return err;
1258                         }
1259
1260                         /* Neighbour tables are full and nothing
1261                            can be released. Try to shrink the route cache;
1262                            it most likely holds some neighbour records.
1263                          */
1264                         if (attempts-- > 0) {
1265                                 int saved_elasticity = ip_rt_gc_elasticity;
1266                                 int saved_int = ip_rt_gc_min_interval;
1267                                 ip_rt_gc_elasticity     = 1;
1268                                 ip_rt_gc_min_interval   = 0;
1269                                 rt_garbage_collect(&ipv4_dst_ops);
1270                                 ip_rt_gc_min_interval   = saved_int;
1271                                 ip_rt_gc_elasticity     = saved_elasticity;
1272                                 goto restart;
1273                         }
1274
1275                         if (net_ratelimit())
1276                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1277                         rt_drop(rt);
1278                         return -ENOBUFS;
1279                 }
1280         }
1281
1282         rt->dst.rt_next = rt_hash_table[hash].chain;
1283
1284 #if RT_CACHE_DEBUG >= 2
1285         if (rt->dst.rt_next) {
1286                 struct rtable *trt;
1287                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1288                        hash, &rt->rt_dst);
1289                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1290                         printk(" . %pI4", &trt->rt_dst);
1291                 printk("\n");
1292         }
1293 #endif
1294         /*
1295          * Since lookup is lockfree, we must make sure
1296          * previous writes to rt are committed to memory
1297          * before making rt visible to other CPUs.
1298          */
1299         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1300
1301         spin_unlock_bh(rt_hash_lock_addr(hash));
1302
1303 skip_hashing:
1304         if (rp)
1305                 *rp = rt;
1306         else
1307                 skb_dst_set(skb, &rt->dst);
1308         return 0;
1309 }
1310
1311 void rt_bind_peer(struct rtable *rt, int create)
1312 {
1313         struct inet_peer *peer;
1314
1315         peer = inet_getpeer_v4(rt->rt_dst, create);
1316
1317         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1318                 inet_putpeer(peer);
1319 }
1320
1321 /*
1322  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1323  * we can still generate some output.
1324  * Random ID selection looks a bit dangerous because we have no chance of
1325  * selecting an ID that is unique over a reasonable period of time.
1326  * But a broken packet identifier may be better than no packet at all.
1327  */
1328 static void ip_select_fb_ident(struct iphdr *iph)
1329 {
1330         static DEFINE_SPINLOCK(ip_fb_id_lock);
1331         static u32 ip_fallback_id;
1332         u32 salt;
1333
1334         spin_lock_bh(&ip_fb_id_lock);
1335         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1336         iph->id = htons(salt & 0xFFFF);
1337         ip_fallback_id = salt;
1338         spin_unlock_bh(&ip_fb_id_lock);
1339 }
1340
1341 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1342 {
1343         struct rtable *rt = (struct rtable *) dst;
1344
1345         if (rt) {
1346                 if (rt->peer == NULL)
1347                         rt_bind_peer(rt, 1);
1348
1349                 /* If peer is attached to destination, it is never detached,
1350                    so we do not need to grab a lock to dereference it.
1351                  */
1352                 if (rt->peer) {
1353                         iph->id = htons(inet_getid(rt->peer, more));
1354                         return;
1355                 }
1356         } else
1357                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1358                        __builtin_return_address(0));
1359
1360         ip_select_fb_ident(iph);
1361 }
1362 EXPORT_SYMBOL(__ip_select_ident);
1363
1364 static void rt_del(unsigned hash, struct rtable *rt)
1365 {
1366         struct rtable __rcu **rthp;
1367         struct rtable *aux;
1368
1369         rthp = &rt_hash_table[hash].chain;
1370         spin_lock_bh(rt_hash_lock_addr(hash));
1371         ip_rt_put(rt);
1372         while ((aux = rcu_dereference_protected(*rthp,
1373                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1374                 if (aux == rt || rt_is_expired(aux)) {
1375                         *rthp = aux->dst.rt_next;
1376                         rt_free(aux);
1377                         continue;
1378                 }
1379                 rthp = &aux->dst.rt_next;
1380         }
1381         spin_unlock_bh(rt_hash_lock_addr(hash));
1382 }
1383
1384 /* called in rcu_read_lock() section */
1385 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1386                     __be32 saddr, struct net_device *dev)
1387 {
1388         int i, k;
1389         struct in_device *in_dev = __in_dev_get_rcu(dev);
1390         struct rtable *rth;
1391         struct rtable __rcu **rthp;
1392         __be32  skeys[2] = { saddr, 0 };
1393         int  ikeys[2] = { dev->ifindex, 0 };
1394         struct netevent_redirect netevent;
1395         struct net *net;
1396
1397         if (!in_dev)
1398                 return;
1399
1400         net = dev_net(dev);
1401         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1402             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1403             ipv4_is_zeronet(new_gw))
1404                 goto reject_redirect;
1405
1406         if (!rt_caching(net))
1407                 goto reject_redirect;
1408
1409         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1410                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1411                         goto reject_redirect;
1412                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1413                         goto reject_redirect;
1414         } else {
1415                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1416                         goto reject_redirect;
1417         }
1418
1419         for (i = 0; i < 2; i++) {
1420                 for (k = 0; k < 2; k++) {
1421                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1422                                                 rt_genid(net));
1423
1424                         rthp = &rt_hash_table[hash].chain;
1425
1426                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1427                                 struct rtable *rt;
1428
1429                                 if (rth->fl.fl4_dst != daddr ||
1430                                     rth->fl.fl4_src != skeys[i] ||
1431                                     rth->fl.oif != ikeys[k] ||
1432                                     rt_is_input_route(rth) ||
1433                                     rt_is_expired(rth) ||
1434                                     !net_eq(dev_net(rth->dst.dev), net)) {
1435                                         rthp = &rth->dst.rt_next;
1436                                         continue;
1437                                 }
1438
1439                                 if (rth->rt_dst != daddr ||
1440                                     rth->rt_src != saddr ||
1441                                     rth->dst.error ||
1442                                     rth->rt_gateway != old_gw ||
1443                                     rth->dst.dev != dev)
1444                                         break;
1445
1446                                 dst_hold(&rth->dst);
1447
1448                                 rt = dst_alloc(&ipv4_dst_ops);
1449                                 if (rt == NULL) {
1450                                         ip_rt_put(rth);
1451                                         return;
1452                                 }
1453
1454                                 /* Copy all the information. */
1455                                 *rt = *rth;
1456                                 rt->dst.__use           = 1;
1457                                 atomic_set(&rt->dst.__refcnt, 1);
1458                                 rt->dst.child           = NULL;
1459                                 if (rt->dst.dev)
1460                                         dev_hold(rt->dst.dev);
1461                                 rt->dst.obsolete        = -1;
1462                                 rt->dst.lastuse = jiffies;
1463                                 rt->dst.path            = &rt->dst;
1464                                 rt->dst.neighbour       = NULL;
1465                                 rt->dst.hh              = NULL;
1466 #ifdef CONFIG_XFRM
1467                                 rt->dst.xfrm            = NULL;
1468 #endif
1469                                 rt->rt_genid            = rt_genid(net);
1470                                 rt->rt_flags            |= RTCF_REDIRECTED;
1471
1472                                 /* Gateway is different ... */
1473                                 rt->rt_gateway          = new_gw;
1474
1475                                 /* Redirect received -> path was valid */
1476                                 dst_confirm(&rth->dst);
1477
1478                                 if (rt->peer)
1479                                         atomic_inc(&rt->peer->refcnt);
1480                                 if (rt->fi)
1481                                         atomic_inc(&rt->fi->fib_clntref);
1482
1483                                 if (arp_bind_neighbour(&rt->dst) ||
1484                                     !(rt->dst.neighbour->nud_state &
1485                                             NUD_VALID)) {
1486                                         if (rt->dst.neighbour)
1487                                                 neigh_event_send(rt->dst.neighbour, NULL);
1488                                         ip_rt_put(rth);
1489                                         rt_drop(rt);
1490                                         goto do_next;
1491                                 }
1492
1493                                 netevent.old = &rth->dst;
1494                                 netevent.new = &rt->dst;
1495                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1496                                                         &netevent);
1497
1498                                 rt_del(hash, rth);
1499                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1500                                         ip_rt_put(rt);
1501                                 goto do_next;
1502                         }
1503                 do_next:
1504                         ;
1505                 }
1506         }
1507         return;
1508
1509 reject_redirect:
1510 #ifdef CONFIG_IP_ROUTE_VERBOSE
1511         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1512                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1513                         "  Advised path = %pI4 -> %pI4\n",
1514                        &old_gw, dev->name, &new_gw,
1515                        &saddr, &daddr);
1516 #endif
1517         ;
1518 }
1519
1520 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1521 {
1522         struct rtable *rt = (struct rtable *)dst;
1523         struct dst_entry *ret = dst;
1524
1525         if (rt) {
1526                 if (dst->obsolete > 0) {
1527                         ip_rt_put(rt);
1528                         ret = NULL;
1529                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1530                            (rt->dst.expires &&
1531                             time_after_eq(jiffies, rt->dst.expires))) {
1532                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1533                                                 rt->fl.oif,
1534                                                 rt_genid(dev_net(dst->dev)));
1535 #if RT_CACHE_DEBUG >= 1
1536                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1537                                 &rt->rt_dst, rt->fl.fl4_tos);
1538 #endif
1539                         rt_del(hash, rt);
1540                         ret = NULL;
1541                 }
1542         }
1543         return ret;
1544 }
1545
1546 /*
1547  * Algorithm:
1548  *      1. The first ip_rt_redirect_number redirects are sent
1549  *         with exponential backoff, then we stop sending them at all,
1550  *         assuming that the host ignores our redirects.
1551  *      2. If we did not see packets requiring redirects
1552  *         during ip_rt_redirect_silence, we assume that the host
1553  *         forgot the redirected route and start sending redirects again.
1554  *
1555  * This algorithm is much cheaper and more intelligent than dumb load limiting
1556  * in icmp.c.
1557  *
1558  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1559  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1560  */
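/*
 * Rough illustration, assuming the sysctl defaults defined earlier in this
 * file (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9,
 * ip_rt_redirect_silence = (HZ/50) << 10): once a redirect has been sent,
 * the next one is held back until rate_last + (ip_rt_redirect_load <<
 * rate_tokens), so the gaps roughly double (about 40 ms, 80 ms, 160 ms, ...).
 * After ip_rt_redirect_number redirects nothing more is sent until about
 * 20 seconds of quiet reset rate_tokens to zero.
 */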
1561
1562 void ip_rt_send_redirect(struct sk_buff *skb)
1563 {
1564         struct rtable *rt = skb_rtable(skb);
1565         struct in_device *in_dev;
1566         struct inet_peer *peer;
1567         int log_martians;
1568
1569         rcu_read_lock();
1570         in_dev = __in_dev_get_rcu(rt->dst.dev);
1571         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1572                 rcu_read_unlock();
1573                 return;
1574         }
1575         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1576         rcu_read_unlock();
1577
1578         if (!rt->peer)
1579                 rt_bind_peer(rt, 1);
1580         peer = rt->peer;
1581         if (!peer) {
1582                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1583                 return;
1584         }
1585
1586         /* No redirected packets during ip_rt_redirect_silence;
1587          * reset the algorithm.
1588          */
1589         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1590                 peer->rate_tokens = 0;
1591
1592         /* Too many ignored redirects; do not send anything.
1593          * Set dst.rate_last to the last seen redirected packet.
1594          */
1595         if (peer->rate_tokens >= ip_rt_redirect_number) {
1596                 peer->rate_last = jiffies;
1597                 return;
1598         }
1599
1600         /* Check for load limit; set rate_last to the latest sent
1601          * redirect.
1602          */
1603         if (peer->rate_tokens == 0 ||
1604             time_after(jiffies,
1605                        (peer->rate_last +
1606                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1607                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1608                 peer->rate_last = jiffies;
1609                 ++peer->rate_tokens;
1610 #ifdef CONFIG_IP_ROUTE_VERBOSE
1611                 if (log_martians &&
1612                     peer->rate_tokens == ip_rt_redirect_number &&
1613                     net_ratelimit())
1614                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1615                                 &rt->rt_src, rt->rt_iif,
1616                                 &rt->rt_dst, &rt->rt_gateway);
1617 #endif
1618         }
1619 }
1620
1621 static int ip_error(struct sk_buff *skb)
1622 {
1623         struct rtable *rt = skb_rtable(skb);
1624         struct inet_peer *peer;
1625         unsigned long now;
1626         bool send;
1627         int code;
1628
1629         switch (rt->dst.error) {
1630                 case EINVAL:
1631                 default:
1632                         goto out;
1633                 case EHOSTUNREACH:
1634                         code = ICMP_HOST_UNREACH;
1635                         break;
1636                 case ENETUNREACH:
1637                         code = ICMP_NET_UNREACH;
1638                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1639                                         IPSTATS_MIB_INNOROUTES);
1640                         break;
1641                 case EACCES:
1642                         code = ICMP_PKT_FILTERED;
1643                         break;
1644         }
1645
1646         if (!rt->peer)
1647                 rt_bind_peer(rt, 1);
1648         peer = rt->peer;
1649
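        /* Classic token bucket: tokens accrue one per jiffy since rate_last,
         * capped at ip_rt_error_burst, and each ICMP error sent costs
         * ip_rt_error_cost tokens.  With the usual defaults (cost = HZ,
         * burst = 5 * HZ) that allows a short burst of about five errors
         * and then roughly one per second per peer.
         */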
1650         send = true;
1651         if (peer) {
1652                 now = jiffies;
1653                 peer->rate_tokens += now - peer->rate_last;
1654                 if (peer->rate_tokens > ip_rt_error_burst)
1655                         peer->rate_tokens = ip_rt_error_burst;
1656                 peer->rate_last = now;
1657                 if (peer->rate_tokens >= ip_rt_error_cost)
1658                         peer->rate_tokens -= ip_rt_error_cost;
1659                 else
1660                         send = false;
1661         }
1662         if (send)
1663                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1664
1665 out:    kfree_skb(skb);
1666         return 0;
1667 }
1668
1669 /*
1670  *      The last two values are not from the RFC but
1671  *      are needed for AMPRnet AX.25 paths.
1672  */
1673
1674 static const unsigned short mtu_plateau[] =
1675 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1676
1677 static inline unsigned short guess_mtu(unsigned short old_mtu)
1678 {
1679         int i;
1680
1681         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1682                 if (old_mtu > mtu_plateau[i])
1683                         return mtu_plateau[i];
1684         return 68;
1685 }
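/*
 * guess_mtu() returns the first plateau strictly below the old MTU, falling
 * back to the 68-byte floor used throughout this file.  For example:
 * guess_mtu(1500) == 1492, guess_mtu(576) == 296 and guess_mtu(68) == 68.
 */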
1686
1687 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1688                                  unsigned short new_mtu,
1689                                  struct net_device *dev)
1690 {
1691         int i, k;
1692         unsigned short old_mtu = ntohs(iph->tot_len);
1693         struct rtable *rth;
1694         int  ikeys[2] = { dev->ifindex, 0 };
1695         __be32  skeys[2] = { iph->saddr, 0, };
1696         __be32  daddr = iph->daddr;
1697         unsigned short est_mtu = 0;
1698
1699         for (k = 0; k < 2; k++) {
1700                 for (i = 0; i < 2; i++) {
1701                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1702                                                 rt_genid(net));
1703
1704                         rcu_read_lock();
1705                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1706                              rth = rcu_dereference(rth->dst.rt_next)) {
1707                                 unsigned short mtu = new_mtu;
1708
1709                                 if (rth->fl.fl4_dst != daddr ||
1710                                     rth->fl.fl4_src != skeys[i] ||
1711                                     rth->rt_dst != daddr ||
1712                                     rth->rt_src != iph->saddr ||
1713                                     rth->fl.oif != ikeys[k] ||
1714                                     rt_is_input_route(rth) ||
1715                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1716                                     !net_eq(dev_net(rth->dst.dev), net) ||
1717                                     rt_is_expired(rth))
1718                                         continue;
1719
1720                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1721
1722                                         /* BSD 4.2 compatibility hack :-( */
1723                                         if (mtu == 0 &&
1724                                             old_mtu >= dst_mtu(&rth->dst) &&
1725                                             old_mtu >= 68 + (iph->ihl << 2))
1726                                                 old_mtu -= iph->ihl << 2;
1727
1728                                         mtu = guess_mtu(old_mtu);
1729                                 }
1730                                 if (mtu <= dst_mtu(&rth->dst)) {
1731                                         if (mtu < dst_mtu(&rth->dst)) {
1732                                                 dst_confirm(&rth->dst);
1733                                                 if (mtu < ip_rt_min_pmtu) {
1734                                                         u32 lock = dst_metric(&rth->dst,
1735                                                                               RTAX_LOCK);
1736                                                         mtu = ip_rt_min_pmtu;
1737                                                         lock |= (1 << RTAX_MTU);
1738                                                         dst_metric_set(&rth->dst, RTAX_LOCK,
1739                                                                        lock);
1740                                                 }
1741                                                 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1742                                                 dst_set_expires(&rth->dst,
1743                                                         ip_rt_mtu_expires);
1744                                         }
1745                                         est_mtu = mtu;
1746                                 }
1747                         }
1748                         rcu_read_unlock();
1749                 }
1750         }
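        /* GCC's "x ? : y" extension: return the estimate derived from the
         * cache if any entry matched, otherwise pass the advertised new_mtu
         * back unchanged.
         */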
1751         return est_mtu ? : new_mtu;
1752 }
1753
1754 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1755 {
1756         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1757             !(dst_metric_locked(dst, RTAX_MTU))) {
1758                 if (mtu < ip_rt_min_pmtu) {
1759                         u32 lock = dst_metric(dst, RTAX_LOCK);
1760                         mtu = ip_rt_min_pmtu;
1761                         dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1762                 }
1763                 dst_metric_set(dst, RTAX_MTU, mtu);
1764                 dst_set_expires(dst, ip_rt_mtu_expires);
1765         }
1766 }
1767
1768 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1769 {
1770         if (rt_is_expired((struct rtable *)dst))
1771                 return NULL;
1772         return dst;
1773 }
1774
1775 static void ipv4_dst_destroy(struct dst_entry *dst)
1776 {
1777         struct rtable *rt = (struct rtable *) dst;
1778         struct inet_peer *peer = rt->peer;
1779
1780         if (rt->fi) {
1781                 fib_info_put(rt->fi);
1782                 rt->fi = NULL;
1783         }
1784         if (peer) {
1785                 rt->peer = NULL;
1786                 inet_putpeer(peer);
1787         }
1788 }
1789
1790
1791 static void ipv4_link_failure(struct sk_buff *skb)
1792 {
1793         struct rtable *rt;
1794
1795         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1796
1797         rt = skb_rtable(skb);
1798         if (rt)
1799                 dst_set_expires(&rt->dst, 0);
1800 }
1801
1802 static int ip_rt_bug(struct sk_buff *skb)
1803 {
1804         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1805                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1806                 skb->dev ? skb->dev->name : "?");
1807         kfree_skb(skb);
1808         return 0;
1809 }
1810
1811 /*
1812    We do not cache the source address of the outgoing interface,
1813    because it is used only by the IP RR, TS and SRR options,
1814    so it is out of the fast path.
1815
1816    BTW remember: "addr" is allowed to be unaligned
1817    in IP options!
1818  */
1819
1820 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1821 {
1822         __be32 src;
1823         struct fib_result res;
1824
1825         if (rt_is_output_route(rt))
1826                 src = rt->rt_src;
1827         else {
1828                 rcu_read_lock();
1829                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1830                         src = FIB_RES_PREFSRC(res);
1831                 else
1832                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1833                                         RT_SCOPE_UNIVERSE);
1834                 rcu_read_unlock();
1835         }
1836         memcpy(addr, &src, 4);
1837 }
1838
1839 #ifdef CONFIG_IP_ROUTE_CLASSID
1840 static void set_class_tag(struct rtable *rt, u32 tag)
1841 {
1842         if (!(rt->dst.tclassid & 0xFFFF))
1843                 rt->dst.tclassid |= tag & 0xFFFF;
1844         if (!(rt->dst.tclassid & 0xFFFF0000))
1845                 rt->dst.tclassid |= tag & 0xFFFF0000;
1846 }
1847 #endif
1848
1849 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1850 {
1851         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1852
1853         if (advmss == 0) {
1854                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1855                                ip_rt_min_advmss);
1856                 if (advmss > 65535 - 40)
1857                         advmss = 65535 - 40;
1858         }
1859         return advmss;
1860 }
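/*
 * For example, a 1500-byte-MTU device with no explicit RTAX_ADVMSS metric
 * advertises 1460 (MTU minus 40 bytes of IPv4 + TCP headers), bounded below
 * by ip_rt_min_advmss and above by 65535 - 40.
 */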
1861
1862 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1863 {
1864         unsigned int mtu = dst->dev->mtu;
1865
1866         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1867                 const struct rtable *rt = (const struct rtable *) dst;
1868
1869                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1870                         mtu = 576;
1871         }
1872
1873         if (mtu > IP_MAX_MTU)
1874                 mtu = IP_MAX_MTU;
1875
1876         return mtu;
1877 }
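/*
 * In other words, a route with a locked MTU metric that goes through a
 * gateway reports at most 576 bytes (the classic conservative default for
 * non-local destinations); everything else keeps the device MTU, capped at
 * IP_MAX_MTU.
 */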
1878
1879 static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
1880 {
1881         struct inet_peer *peer;
1882         int create = 0;
1883
1884         /* If a peer entry exists for this destination, we must hook
1885          * it up in order to get at cached metrics.
1886          */
1887         if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)
1888                 create = 1;
1889
1890         rt_bind_peer(rt, create);
1891         peer = rt->peer;
1892         if (peer) {
1893                 if (inet_metrics_new(peer))
1894                         memcpy(peer->metrics, fi->fib_metrics,
1895                                sizeof(u32) * RTAX_MAX);
1896                 dst_init_metrics(&rt->dst, peer->metrics, false);
1897         } else {
1898                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1899                         rt->fi = fi;
1900                         atomic_inc(&fi->fib_clntref);
1901                 }
1902                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1903         }
1904 }
1905
1906 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1907 {
1908         struct dst_entry *dst = &rt->dst;
1909         struct fib_info *fi = res->fi;
1910
1911         if (fi) {
1912                 if (FIB_RES_GW(*res) &&
1913                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1914                         rt->rt_gateway = FIB_RES_GW(*res);
1915                 rt_init_metrics(rt, fi);
1916 #ifdef CONFIG_IP_ROUTE_CLASSID
1917                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1918 #endif
1919         }
1920
1921         if (dst_mtu(dst) > IP_MAX_MTU)
1922                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1923         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1924                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1925
1926 #ifdef CONFIG_IP_ROUTE_CLASSID
1927 #ifdef CONFIG_IP_MULTIPLE_TABLES
1928         set_class_tag(rt, fib_rules_tclass(res));
1929 #endif
1930         set_class_tag(rt, itag);
1931 #endif
1932         rt->rt_type = res->type;
1933 }
1934
1935 /* called in rcu_read_lock() section */
1936 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1937                                 u8 tos, struct net_device *dev, int our)
1938 {
1939         unsigned int hash;
1940         struct rtable *rth;
1941         __be32 spec_dst;
1942         struct in_device *in_dev = __in_dev_get_rcu(dev);
1943         u32 itag = 0;
1944         int err;
1945
1946         /* Primary sanity checks. */
1947
1948         if (in_dev == NULL)
1949                 return -EINVAL;
1950
1951         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1952             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1953                 goto e_inval;
1954
1955         if (ipv4_is_zeronet(saddr)) {
1956                 if (!ipv4_is_local_multicast(daddr))
1957                         goto e_inval;
1958                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1959         } else {
1960                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1961                                           &itag, 0);
1962                 if (err < 0)
1963                         goto e_err;
1964         }
1965         rth = dst_alloc(&ipv4_dst_ops);
1966         if (!rth)
1967                 goto e_nobufs;
1968
1969         rth->dst.output = ip_rt_bug;
1970         rth->dst.obsolete = -1;
1971
1972         atomic_set(&rth->dst.__refcnt, 1);
1973         rth->dst.flags= DST_HOST;
1974         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1975                 rth->dst.flags |= DST_NOPOLICY;
1976         rth->fl.fl4_dst = daddr;
1977         rth->rt_dst     = daddr;
1978         rth->fl.fl4_tos = tos;
1979         rth->fl.mark    = skb->mark;
1980         rth->fl.fl4_src = saddr;
1981         rth->rt_src     = saddr;
1982 #ifdef CONFIG_IP_ROUTE_CLASSID
1983         rth->dst.tclassid = itag;
1984 #endif
1985         rth->rt_iif     =
1986         rth->fl.iif     = dev->ifindex;
1987         rth->dst.dev    = init_net.loopback_dev;
1988         dev_hold(rth->dst.dev);
1989         rth->fl.oif     = 0;
1990         rth->rt_gateway = daddr;
1991         rth->rt_spec_dst= spec_dst;
1992         rth->rt_genid   = rt_genid(dev_net(dev));
1993         rth->rt_flags   = RTCF_MULTICAST;
1994         rth->rt_type    = RTN_MULTICAST;
1995         if (our) {
1996                 rth->dst.input= ip_local_deliver;
1997                 rth->rt_flags |= RTCF_LOCAL;
1998         }
1999
2000 #ifdef CONFIG_IP_MROUTE
2001         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2002                 rth->dst.input = ip_mr_input;
2003 #endif
2004         RT_CACHE_STAT_INC(in_slow_mc);
2005
2006         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2007         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
2008
2009 e_nobufs:
2010         return -ENOBUFS;
2011 e_inval:
2012         return -EINVAL;
2013 e_err:
2014         return err;
2015 }
2016
2017
2018 static void ip_handle_martian_source(struct net_device *dev,
2019                                      struct in_device *in_dev,
2020                                      struct sk_buff *skb,
2021                                      __be32 daddr,
2022                                      __be32 saddr)
2023 {
2024         RT_CACHE_STAT_INC(in_martian_src);
2025 #ifdef CONFIG_IP_ROUTE_VERBOSE
2026         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2027                 /*
2028                  *      RFC1812 recommendation, if source is martian,
2029                  *      the only hint is MAC header.
2030                  */
2031                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2032                         &daddr, &saddr, dev->name);
2033                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2034                         int i;
2035                         const unsigned char *p = skb_mac_header(skb);
2036                         printk(KERN_WARNING "ll header: ");
2037                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2038                                 printk("%02x", *p);
2039                                 if (i < (dev->hard_header_len - 1))
2040                                         printk(":");
2041                         }
2042                         printk("\n");
2043                 }
2044         }
2045 #endif
2046 }
2047
2048 /* called in rcu_read_lock() section */
2049 static int __mkroute_input(struct sk_buff *skb,
2050                            struct fib_result *res,
2051                            struct in_device *in_dev,
2052                            __be32 daddr, __be32 saddr, u32 tos,
2053                            struct rtable **result)
2054 {
2055         struct rtable *rth;
2056         int err;
2057         struct in_device *out_dev;
2058         unsigned int flags = 0;
2059         __be32 spec_dst;
2060         u32 itag;
2061
2062         /* get a working reference to the output device */
2063         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2064         if (out_dev == NULL) {
2065                 if (net_ratelimit())
2066                         printk(KERN_CRIT "Bug in ip_route_input" \
2067                                "_slow(). Please, report\n");
2068                 return -EINVAL;
2069         }
2070
2071
2072         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2073                                   in_dev->dev, &spec_dst, &itag, skb->mark);
2074         if (err < 0) {
2075                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2076                                          saddr);
2077
2078                 goto cleanup;
2079         }
2080
2081         if (err)
2082                 flags |= RTCF_DIRECTSRC;
2083
2084         if (out_dev == in_dev && err &&
2085             (IN_DEV_SHARED_MEDIA(out_dev) ||
2086              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2087                 flags |= RTCF_DOREDIRECT;
2088
2089         if (skb->protocol != htons(ETH_P_IP)) {
2090                 /* Not IP (i.e. ARP). Do not create a route if it is
2091                  * invalid for proxy arp. DNAT routes are always valid.
2092                  *
2093                  * The proxy arp feature has been extended to allow ARP
2094                  * replies back on the same interface, to support
2095                  * Private VLAN switch technologies. See arp.c.
2096                  */
2097                 if (out_dev == in_dev &&
2098                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2099                         err = -EINVAL;
2100                         goto cleanup;
2101                 }
2102         }
2103
2104
2105         rth = dst_alloc(&ipv4_dst_ops);
2106         if (!rth) {
2107                 err = -ENOBUFS;
2108                 goto cleanup;
2109         }
2110
2111         atomic_set(&rth->dst.__refcnt, 1);
2112         rth->dst.flags= DST_HOST;
2113         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2114                 rth->dst.flags |= DST_NOPOLICY;
2115         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2116                 rth->dst.flags |= DST_NOXFRM;
2117         rth->fl.fl4_dst = daddr;
2118         rth->rt_dst     = daddr;
2119         rth->fl.fl4_tos = tos;
2120         rth->fl.mark    = skb->mark;
2121         rth->fl.fl4_src = saddr;
2122         rth->rt_src     = saddr;
2123         rth->rt_gateway = daddr;
2124         rth->rt_iif     =
2125                 rth->fl.iif     = in_dev->dev->ifindex;
2126         rth->dst.dev    = (out_dev)->dev;
2127         dev_hold(rth->dst.dev);
2128         rth->fl.oif     = 0;
2129         rth->rt_spec_dst= spec_dst;
2130
2131         rth->dst.obsolete = -1;
2132         rth->dst.input = ip_forward;
2133         rth->dst.output = ip_output;
2134         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2135
2136         rt_set_nexthop(rth, res, itag);
2137
2138         rth->rt_flags = flags;
2139
2140         *result = rth;
2141         err = 0;
2142  cleanup:
2143         return err;
2144 }
2145
2146 static int ip_mkroute_input(struct sk_buff *skb,
2147                             struct fib_result *res,
2148                             const struct flowi *fl,
2149                             struct in_device *in_dev,
2150                             __be32 daddr, __be32 saddr, u32 tos)
2151 {
2152         struct rtable* rth = NULL;
2153         int err;
2154         unsigned hash;
2155
2156 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2157         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2158                 fib_select_multipath(fl, res);
2159 #endif
2160
2161         /* create a routing cache entry */
2162         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2163         if (err)
2164                 return err;
2165
2166         /* put it into the cache */
2167         hash = rt_hash(daddr, saddr, fl->iif,
2168                        rt_genid(dev_net(rth->dst.dev)));
2169         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2170 }
2171
2172 /*
2173  *      NOTE. We drop all packets that have local source
2174  *      addresses, because every properly looped-back packet
2175  *      must already have the correct destination attached by the output routine.
2176  *
2177  *      Such an approach solves two big problems:
2178  *      1. Non-simplex devices are handled properly.
2179  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2180  *      called with rcu_read_lock()
2181  */
2182
2183 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2184                                u8 tos, struct net_device *dev)
2185 {
2186         struct fib_result res;
2187         struct in_device *in_dev = __in_dev_get_rcu(dev);
2188         struct flowi fl = { .fl4_dst    = daddr,
2189                             .fl4_src    = saddr,
2190                             .fl4_tos    = tos,
2191                             .fl4_scope  = RT_SCOPE_UNIVERSE,
2192                             .mark = skb->mark,
2193                             .iif = dev->ifindex };
2194         unsigned        flags = 0;
2195         u32             itag = 0;
2196         struct rtable * rth;
2197         unsigned        hash;
2198         __be32          spec_dst;
2199         int             err = -EINVAL;
2200         struct net    * net = dev_net(dev);
2201
2202         /* IP on this device is disabled. */
2203
2204         if (!in_dev)
2205                 goto out;
2206
2207         /* Check for the weirdest martians, which cannot be detected
2208            by fib_lookup.
2209          */
2210
2211         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2212             ipv4_is_loopback(saddr))
2213                 goto martian_source;
2214
2215         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2216                 goto brd_input;
2217
2218         /* Accept zero source addresses only for limited broadcast;
2219          * I do not even know whether to fix it or not. Waiting for complaints :-)
2220          */
2221         if (ipv4_is_zeronet(saddr))
2222                 goto martian_source;
2223
2224         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2225                 goto martian_destination;
2226
2227         /*
2228          *      Now we are ready to route the packet.
2229          */
2230         err = fib_lookup(net, &fl, &res);
2231         if (err != 0) {
2232                 if (!IN_DEV_FORWARD(in_dev))
2233                         goto e_hostunreach;
2234                 goto no_route;
2235         }
2236
2237         RT_CACHE_STAT_INC(in_slow_tot);
2238
2239         if (res.type == RTN_BROADCAST)
2240                 goto brd_input;
2241
2242         if (res.type == RTN_LOCAL) {
2243                 err = fib_validate_source(saddr, daddr, tos,
2244                                           net->loopback_dev->ifindex,
2245                                           dev, &spec_dst, &itag, skb->mark);
2246                 if (err < 0)
2247                         goto martian_source_keep_err;
2248                 if (err)
2249                         flags |= RTCF_DIRECTSRC;
2250                 spec_dst = daddr;
2251                 goto local_input;
2252         }
2253
2254         if (!IN_DEV_FORWARD(in_dev))
2255                 goto e_hostunreach;
2256         if (res.type != RTN_UNICAST)
2257                 goto martian_destination;
2258
2259         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2260 out:    return err;
2261
2262 brd_input:
2263         if (skb->protocol != htons(ETH_P_IP))
2264                 goto e_inval;
2265
2266         if (ipv4_is_zeronet(saddr))
2267                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2268         else {
2269                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2270                                           &itag, skb->mark);
2271                 if (err < 0)
2272                         goto martian_source_keep_err;
2273                 if (err)
2274                         flags |= RTCF_DIRECTSRC;
2275         }
2276         flags |= RTCF_BROADCAST;
2277         res.type = RTN_BROADCAST;
2278         RT_CACHE_STAT_INC(in_brd);
2279
2280 local_input:
2281         rth = dst_alloc(&ipv4_dst_ops);
2282         if (!rth)
2283                 goto e_nobufs;
2284
2285         rth->dst.output= ip_rt_bug;
2286         rth->dst.obsolete = -1;
2287         rth->rt_genid = rt_genid(net);
2288
2289         atomic_set(&rth->dst.__refcnt, 1);
2290         rth->dst.flags= DST_HOST;
2291         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2292                 rth->dst.flags |= DST_NOPOLICY;
2293         rth->fl.fl4_dst = daddr;
2294         rth->rt_dst     = daddr;
2295         rth->fl.fl4_tos = tos;
2296         rth->fl.mark    = skb->mark;
2297         rth->fl.fl4_src = saddr;
2298         rth->rt_src     = saddr;
2299 #ifdef CONFIG_IP_ROUTE_CLASSID
2300         rth->dst.tclassid = itag;
2301 #endif
2302         rth->rt_iif     =
2303         rth->fl.iif     = dev->ifindex;
2304         rth->dst.dev    = net->loopback_dev;
2305         dev_hold(rth->dst.dev);
2306         rth->rt_gateway = daddr;
2307         rth->rt_spec_dst= spec_dst;
2308         rth->dst.input= ip_local_deliver;
2309         rth->rt_flags   = flags|RTCF_LOCAL;
2310         if (res.type == RTN_UNREACHABLE) {
2311                 rth->dst.input= ip_error;
2312                 rth->dst.error= -err;
2313                 rth->rt_flags   &= ~RTCF_LOCAL;
2314         }
2315         rth->rt_type    = res.type;
2316         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2317         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2318         goto out;
2319
2320 no_route:
2321         RT_CACHE_STAT_INC(in_no_route);
2322         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2323         res.type = RTN_UNREACHABLE;
2324         if (err == -ESRCH)
2325                 err = -ENETUNREACH;
2326         goto local_input;
2327
2328         /*
2329          *      Do not cache martian addresses: they should be logged (RFC1812)
2330          */
2331 martian_destination:
2332         RT_CACHE_STAT_INC(in_martian_dst);
2333 #ifdef CONFIG_IP_ROUTE_VERBOSE
2334         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2335                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2336                         &daddr, &saddr, dev->name);
2337 #endif
2338
2339 e_hostunreach:
2340         err = -EHOSTUNREACH;
2341         goto out;
2342
2343 e_inval:
2344         err = -EINVAL;
2345         goto out;
2346
2347 e_nobufs:
2348         err = -ENOBUFS;
2349         goto out;
2350
2351 martian_source:
2352         err = -EINVAL;
2353 martian_source_keep_err:
2354         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2355         goto out;
2356 }
2357
2358 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2359                            u8 tos, struct net_device *dev, bool noref)
2360 {
2361         struct rtable * rth;
2362         unsigned        hash;
2363         int iif = dev->ifindex;
2364         struct net *net;
2365         int res;
2366
2367         net = dev_net(dev);
2368
2369         rcu_read_lock();
2370
2371         if (!rt_caching(net))
2372                 goto skip_cache;
2373
2374         tos &= IPTOS_RT_MASK;
2375         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2376
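        /* The key comparison below is folded into a single OR of XORs so
         * that a mismatch costs only one branch per chain entry;
         * rth->fl.oif is OR-ed in directly because it must be zero for an
         * input route.
         */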
2377         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2378              rth = rcu_dereference(rth->dst.rt_next)) {
2379                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2380                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2381                      (rth->fl.iif ^ iif) |
2382                      rth->fl.oif |
2383                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2384                     rth->fl.mark == skb->mark &&
2385                     net_eq(dev_net(rth->dst.dev), net) &&
2386                     !rt_is_expired(rth)) {
2387                         if (noref) {
2388                                 dst_use_noref(&rth->dst, jiffies);
2389                                 skb_dst_set_noref(skb, &rth->dst);
2390                         } else {
2391                                 dst_use(&rth->dst, jiffies);
2392                                 skb_dst_set(skb, &rth->dst);
2393                         }
2394                         RT_CACHE_STAT_INC(in_hit);
2395                         rcu_read_unlock();
2396                         return 0;
2397                 }
2398                 RT_CACHE_STAT_INC(in_hlist_search);
2399         }
2400
2401 skip_cache:
2402         /* Multicast recognition logic was moved from the route cache to here.
2403            The problem was that too many Ethernet cards have broken/missing
2404            hardware multicast filters :-( As a result, a host on a multicast
2405            network acquires a lot of useless route cache entries, e.g. for
2406            SDR messages from all over the world. Now we try to get rid of them.
2407            Really, provided the software IP multicast filter is organized
2408            reasonably (at least hashed), it does not result in a slowdown
2409            compared with route cache reject entries.
2410            Note that multicast routers are not affected, because a
2411            route cache entry is created eventually.
2412          */
2413         if (ipv4_is_multicast(daddr)) {
2414                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2415
2416                 if (in_dev) {
2417                         int our = ip_check_mc(in_dev, daddr, saddr,
2418                                               ip_hdr(skb)->protocol);
2419                         if (our
2420 #ifdef CONFIG_IP_MROUTE
2421                                 ||
2422                             (!ipv4_is_local_multicast(daddr) &&
2423                              IN_DEV_MFORWARD(in_dev))
2424 #endif
2425                            ) {
2426                                 int res = ip_route_input_mc(skb, daddr, saddr,
2427                                                             tos, dev, our);
2428                                 rcu_read_unlock();
2429                                 return res;
2430                         }
2431                 }
2432                 rcu_read_unlock();
2433                 return -EINVAL;
2434         }
2435         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2436         rcu_read_unlock();
2437         return res;
2438 }
2439 EXPORT_SYMBOL(ip_route_input_common);
2440
2441 /* called with rcu_read_lock() */
2442 static int __mkroute_output(struct rtable **result,
2443                             struct fib_result *res,
2444                             const struct flowi *fl,
2445                             const struct flowi *oldflp,
2446                             struct net_device *dev_out,
2447                             unsigned flags)
2448 {
2449         struct rtable *rth;
2450         struct in_device *in_dev;
2451         u32 tos = RT_FL_TOS(oldflp);
2452
2453         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2454                 return -EINVAL;
2455
2456         if (ipv4_is_lbcast(fl->fl4_dst))
2457                 res->type = RTN_BROADCAST;
2458         else if (ipv4_is_multicast(fl->fl4_dst))
2459                 res->type = RTN_MULTICAST;
2460         else if (ipv4_is_zeronet(fl->fl4_dst))
2461                 return -EINVAL;
2462
2463         if (dev_out->flags & IFF_LOOPBACK)
2464                 flags |= RTCF_LOCAL;
2465
2466         in_dev = __in_dev_get_rcu(dev_out);
2467         if (!in_dev)
2468                 return -EINVAL;
2469
2470         if (res->type == RTN_BROADCAST) {
2471                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2472                 res->fi = NULL;
2473         } else if (res->type == RTN_MULTICAST) {
2474                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2475                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2476                                  oldflp->proto))
2477                         flags &= ~RTCF_LOCAL;
2478                 /* If a multicast route does not exist, use the
2479                  * default one, but do not gateway in this case.
2480                  * Yes, it is a hack.
2481                  */
2482                 if (res->fi && res->prefixlen < 4)
2483                         res->fi = NULL;
2484         }
2485
2486
2487         rth = dst_alloc(&ipv4_dst_ops);
2488         if (!rth)
2489                 return -ENOBUFS;
2490
2491         atomic_set(&rth->dst.__refcnt, 1);
2492         rth->dst.flags= DST_HOST;
2493         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2494                 rth->dst.flags |= DST_NOXFRM;
2495         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2496                 rth->dst.flags |= DST_NOPOLICY;
2497
2498         rth->fl.fl4_dst = oldflp->fl4_dst;
2499         rth->fl.fl4_tos = tos;
2500         rth->fl.fl4_src = oldflp->fl4_src;
2501         rth->fl.oif     = oldflp->oif;
2502         rth->fl.mark    = oldflp->mark;
2503         rth->rt_dst     = fl->fl4_dst;
2504         rth->rt_src     = fl->fl4_src;
2505         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2506         /* get references to the devices that are to be held by the routing
2507            cache entry */
2508         rth->dst.dev    = dev_out;
2509         dev_hold(dev_out);
2510         rth->rt_gateway = fl->fl4_dst;
2511         rth->rt_spec_dst= fl->fl4_src;
2512
2513         rth->dst.output=ip_output;
2514         rth->dst.obsolete = -1;
2515         rth->rt_genid = rt_genid(dev_net(dev_out));
2516
2517         RT_CACHE_STAT_INC(out_slow_tot);
2518
2519         if (flags & RTCF_LOCAL) {
2520                 rth->dst.input = ip_local_deliver;
2521                 rth->rt_spec_dst = fl->fl4_dst;
2522         }
2523         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2524                 rth->rt_spec_dst = fl->fl4_src;
2525                 if (flags & RTCF_LOCAL &&
2526                     !(dev_out->flags & IFF_LOOPBACK)) {
2527                         rth->dst.output = ip_mc_output;
2528                         RT_CACHE_STAT_INC(out_slow_mc);
2529                 }
2530 #ifdef CONFIG_IP_MROUTE
2531                 if (res->type == RTN_MULTICAST) {
2532                         if (IN_DEV_MFORWARD(in_dev) &&
2533                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2534                                 rth->dst.input = ip_mr_input;
2535                                 rth->dst.output = ip_mc_output;
2536                         }
2537                 }
2538 #endif
2539         }
2540
2541         rt_set_nexthop(rth, res, 0);
2542
2543         rth->rt_flags = flags;
2544         *result = rth;
2545         return 0;
2546 }
2547
2548 /* called with rcu_read_lock() */
2549 static int ip_mkroute_output(struct rtable **rp,
2550                              struct fib_result *res,
2551                              const struct flowi *fl,
2552                              const struct flowi *oldflp,
2553                              struct net_device *dev_out,
2554                              unsigned flags)
2555 {
2556         struct rtable *rth = NULL;
2557         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2558         unsigned hash;
2559         if (err == 0) {
2560                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2561                                rt_genid(dev_net(dev_out)));
2562                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2563         }
2564
2565         return err;
2566 }
2567
2568 /*
2569  * Major route resolver routine.
2570  * called with rcu_read_lock();
2571  */
2572
2573 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2574                                 const struct flowi *oldflp)
2575 {
2576         u32 tos = RT_FL_TOS(oldflp);
2577         struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2578                             .fl4_src = oldflp->fl4_src,
2579                             .fl4_tos = tos & IPTOS_RT_MASK,
2580                             .fl4_scope = ((tos & RTO_ONLINK) ?
2581                                           RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2582                             .mark = oldflp->mark,
2583                             .iif = net->loopback_dev->ifindex,
2584                             .oif = oldflp->oif };
2585         struct fib_result res;
2586         unsigned int flags = 0;
2587         struct net_device *dev_out = NULL;
2588         int err;
2589
2590
2591         res.fi          = NULL;
2592 #ifdef CONFIG_IP_MULTIPLE_TABLES
2593         res.r           = NULL;
2594 #endif
2595
2596         if (oldflp->fl4_src) {
2597                 err = -EINVAL;
2598                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2599                     ipv4_is_lbcast(oldflp->fl4_src) ||
2600                     ipv4_is_zeronet(oldflp->fl4_src))
2601                         goto out;
2602
2603                 /* I removed the check for oif == dev_out->oif here.
2604                    It was wrong for two reasons:
2605                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2606                       is assigned to multiple interfaces.
2607                    2. Moreover, we are allowed to send packets with the saddr
2608                       of another iface. --ANK
2609                  */
2610
2611                 if (oldflp->oif == 0 &&
2612                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2613                      ipv4_is_lbcast(oldflp->fl4_dst))) {
2614                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2615                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2616                         if (dev_out == NULL)
2617                                 goto out;
2618
2619                         /* Special hack: the user can direct multicasts
2620                            and limited broadcasts via the necessary interface
2621                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2622                            This hack is not just for fun, it allows
2623                            vic, vat and friends to work.
2624                            They bind the socket to loopback, set the ttl to zero
2625                            and expect that it will work.
2626                            From the viewpoint of the routing cache they are broken,
2627                            because we are not allowed to build a multicast path
2628                            with a loopback source addr (look, the routing cache
2629                            cannot know that the ttl is zero, so the packet
2630                            will not leave this host and the route is valid).
2631                            Luckily, this hack is a good workaround.
2632                          */
2633
2634                         fl.oif = dev_out->ifindex;
2635                         goto make_route;
2636                 }
2637
2638                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2639                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2640                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2641                                 goto out;
2642                 }
2643         }
2644
2645
2646         if (oldflp->oif) {
2647                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2648                 err = -ENODEV;
2649                 if (dev_out == NULL)
2650                         goto out;
2651
2652                 /* RACE: Check return value of inet_select_addr instead. */
2653                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2654                         err = -ENETUNREACH;
2655                         goto out;
2656                 }
2657                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2658                     ipv4_is_lbcast(oldflp->fl4_dst)) {
2659                         if (!fl.fl4_src)
2660                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2661                                                               RT_SCOPE_LINK);
2662                         goto make_route;
2663                 }
2664                 if (!fl.fl4_src) {
2665                         if (ipv4_is_multicast(oldflp->fl4_dst))
2666                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2667                                                               fl.fl4_scope);
2668                         else if (!oldflp->fl4_dst)
2669                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2670                                                               RT_SCOPE_HOST);
2671                 }
2672         }
2673
2674         if (!fl.fl4_dst) {
2675                 fl.fl4_dst = fl.fl4_src;
2676                 if (!fl.fl4_dst)
2677                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2678                 dev_out = net->loopback_dev;
2679                 fl.oif = net->loopback_dev->ifindex;
2680                 res.type = RTN_LOCAL;
2681                 flags |= RTCF_LOCAL;
2682                 goto make_route;
2683         }
2684
2685         if (fib_lookup(net, &fl, &res)) {
2686                 res.fi = NULL;
2687                 if (oldflp->oif) {
2688                         /* Apparently, the routing tables are wrong. Assume
2689                            that the destination is on-link.
2690
2691                            WHY? DW.
2692                            Because we are allowed to send to an iface
2693                            even if it has NO routes and NO assigned
2694                            addresses. When oif is specified, the routing
2695                            tables are looked up with only one purpose:
2696                            to check whether the destination is gatewayed rather
2697                            than direct. Moreover, if MSG_DONTROUTE is set,
2698                            we send the packet, ignoring both the routing tables
2699                            and the ifaddr state. --ANK
2700
2701
2702                            We could do this even if oif is unknown,
2703                            as IPv6 likely does, but we do not.
2704                          */
2705
2706                         if (fl.fl4_src == 0)
2707                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2708                                                               RT_SCOPE_LINK);
2709                         res.type = RTN_UNICAST;
2710                         goto make_route;
2711                 }
2712                 err = -ENETUNREACH;
2713                 goto out;
2714         }
2715
2716         if (res.type == RTN_LOCAL) {
2717                 if (!fl.fl4_src) {
2718                         if (res.fi->fib_prefsrc)
2719                                 fl.fl4_src = res.fi->fib_prefsrc;
2720                         else
2721                                 fl.fl4_src = fl.fl4_dst;
2722                 }
2723                 dev_out = net->loopback_dev;
2724                 fl.oif = dev_out->ifindex;
2725                 res.fi = NULL;
2726                 flags |= RTCF_LOCAL;
2727                 goto make_route;
2728         }
2729
2730 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2731         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2732                 fib_select_multipath(&fl, &res);
2733         else
2734 #endif
2735         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2736                 fib_select_default(&res);
2737
2738         if (!fl.fl4_src)
2739                 fl.fl4_src = FIB_RES_PREFSRC(res);
2740
2741         dev_out = FIB_RES_DEV(res);
2742         fl.oif = dev_out->ifindex;
2743
2744
2745 make_route:
2746         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2747
2748 out:    return err;
2749 }
2750
2751 int __ip_route_output_key(struct net *net, struct rtable **rp,
2752                           const struct flowi *flp)
2753 {
2754         unsigned int hash;
2755         int res;
2756         struct rtable *rth;
2757
2758         if (!rt_caching(net))
2759                 goto slow_output;
2760
2761         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2762
2763         rcu_read_lock_bh();
2764         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2765                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2766                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2767                     rth->fl.fl4_src == flp->fl4_src &&
2768                     rt_is_output_route(rth) &&
2769                     rth->fl.oif == flp->oif &&
2770                     rth->fl.mark == flp->mark &&
2771                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2772                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2773                     net_eq(dev_net(rth->dst.dev), net) &&
2774                     !rt_is_expired(rth)) {
2775                         dst_use(&rth->dst, jiffies);
2776                         RT_CACHE_STAT_INC(out_hit);
2777                         rcu_read_unlock_bh();
2778                         *rp = rth;
2779                         return 0;
2780                 }
2781                 RT_CACHE_STAT_INC(out_hlist_search);
2782         }
2783         rcu_read_unlock_bh();
2784
2785 slow_output:
2786         rcu_read_lock();
2787         res = ip_route_output_slow(net, rp, flp);
2788         rcu_read_unlock();
2789         return res;
2790 }
2791 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2792
2793 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2794 {
2795         return NULL;
2796 }
2797
2798 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2799 {
2800         return 0;
2801 }
2802
2803 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2804 {
2805 }
2806
2807 static struct dst_ops ipv4_dst_blackhole_ops = {
2808         .family                 =       AF_INET,
2809         .protocol               =       cpu_to_be16(ETH_P_IP),
2810         .destroy                =       ipv4_dst_destroy,
2811         .check                  =       ipv4_blackhole_dst_check,
2812         .default_mtu            =       ipv4_blackhole_default_mtu,
2813         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2814 };
2815
2816
2817 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2818 {
2819         struct rtable *ort = *rp;
2820         struct rtable *rt = (struct rtable *)
2821                 dst_alloc(&ipv4_dst_blackhole_ops);
2822
2823         if (rt) {
2824                 struct dst_entry *new = &rt->dst;
2825
2826                 atomic_set(&new->__refcnt, 1);
2827                 new->__use = 1;
2828                 new->input = dst_discard;
2829                 new->output = dst_discard;
2830                 dst_copy_metrics(new, &ort->dst);
2831
2832                 new->dev = ort->dst.dev;
2833                 if (new->dev)
2834                         dev_hold(new->dev);
2835
2836                 rt->fl = ort->fl;
2837
2838                 rt->rt_genid = rt_genid(net);
2839                 rt->rt_flags = ort->rt_flags;
2840                 rt->rt_type = ort->rt_type;
2841                 rt->rt_dst = ort->rt_dst;
2842                 rt->rt_src = ort->rt_src;
2843                 rt->rt_iif = ort->rt_iif;
2844                 rt->rt_gateway = ort->rt_gateway;
2845                 rt->rt_spec_dst = ort->rt_spec_dst;
2846                 rt->peer = ort->peer;
2847                 if (rt->peer)
2848                         atomic_inc(&rt->peer->refcnt);
2849                 rt->fi = ort->fi;
2850                 if (rt->fi)
2851                         atomic_inc(&rt->fi->fib_clntref);
2852
2853                 dst_free(new);
2854         }
2855
2856         dst_release(&(*rp)->dst);
2857         *rp = rt;
2858         return rt ? 0 : -ENOMEM;
2859 }
2860
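/*
 * Resolve an output route for @flp.  If a transport protocol is given,
 * the result is also passed through the XFRM lookup; a lookup that
 * cannot complete without blocking (-EREMOTE) is turned into a
 * blackhole route rather than an error.
 */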
2861 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2862                          struct sock *sk, int flags)
2863 {
2864         int err;
2865
2866         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2867                 return err;
2868
2869         if (flp->proto) {
2870                 if (!flp->fl4_src)
2871                         flp->fl4_src = (*rp)->rt_src;
2872                 if (!flp->fl4_dst)
2873                         flp->fl4_dst = (*rp)->rt_dst;
2874                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2875                                     flags ? XFRM_LOOKUP_WAIT : 0);
2876                 if (err == -EREMOTE)
2877                         err = ipv4_dst_blackhole(net, rp, flp);
2878
2879                 return err;
2880         }
2881
2882         return 0;
2883 }
2884 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2885
2886 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2887 {
2888         return ip_route_output_flow(net, rp, flp, NULL, 0);
2889 }
2890 EXPORT_SYMBOL(ip_route_output_key);
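/*
 * Typical usage (a minimal sketch; "daddr" stands for whatever
 * destination address the caller already holds):
 *
 *	struct flowi fl = { .fl4_dst = daddr };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl) == 0) {
 *		... transmit via rt->dst ...
 *		ip_rt_put(rt);
 *	}
 */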
2891
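/*
 * Fill a netlink route message describing @rt: addresses, table,
 * metrics, cache info and, for input routes, the incoming interface or
 * multicast forwarding data.
 */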
2892 static int rt_fill_info(struct net *net,
2893                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2894                         int nowait, unsigned int flags)
2895 {
2896         struct rtable *rt = skb_rtable(skb);
2897         struct rtmsg *r;
2898         struct nlmsghdr *nlh;
2899         long expires;
2900         u32 id = 0, ts = 0, tsage = 0, error;
2901
2902         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2903         if (nlh == NULL)
2904                 return -EMSGSIZE;
2905
2906         r = nlmsg_data(nlh);
2907         r->rtm_family    = AF_INET;
2908         r->rtm_dst_len  = 32;
2909         r->rtm_src_len  = 0;
2910         r->rtm_tos      = rt->fl.fl4_tos;
2911         r->rtm_table    = RT_TABLE_MAIN;
2912         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2913         r->rtm_type     = rt->rt_type;
2914         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2915         r->rtm_protocol = RTPROT_UNSPEC;
2916         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2917         if (rt->rt_flags & RTCF_NOTIFY)
2918                 r->rtm_flags |= RTM_F_NOTIFY;
2919
2920         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2921
2922         if (rt->fl.fl4_src) {
2923                 r->rtm_src_len = 32;
2924                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2925         }
2926         if (rt->dst.dev)
2927                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2928 #ifdef CONFIG_IP_ROUTE_CLASSID
2929         if (rt->dst.tclassid)
2930                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2931 #endif
2932         if (rt_is_input_route(rt))
2933                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2934         else if (rt->rt_src != rt->fl.fl4_src)
2935                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2936
2937         if (rt->rt_dst != rt->rt_gateway)
2938                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2939
2940         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2941                 goto nla_put_failure;
2942
2943         if (rt->fl.mark)
2944                 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2945
2946         error = rt->dst.error;
2947         expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2948         if (rt->peer) {
2949                 inet_peer_refcheck(rt->peer);
2950                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2951                 if (rt->peer->tcp_ts_stamp) {
2952                         ts = rt->peer->tcp_ts;
2953                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2954                 }
2955         }
2956
2957         if (rt_is_input_route(rt)) {
2958 #ifdef CONFIG_IP_MROUTE
2959                 __be32 dst = rt->rt_dst;
2960
2961                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2962                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2963                         int err = ipmr_get_route(net, skb, r, nowait);
2964                         if (err <= 0) {
2965                                 if (!nowait) {
2966                                         if (err == 0)
2967                                                 return 0;
2968                                         goto nla_put_failure;
2969                                 } else {
2970                                         if (err == -EMSGSIZE)
2971                                                 goto nla_put_failure;
2972                                         error = err;
2973                                 }
2974                         }
2975                 } else
2976 #endif
2977                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2978         }
2979
2980         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2981                                expires, error) < 0)
2982                 goto nla_put_failure;
2983
2984         return nlmsg_end(skb, nlh);
2985
2986 nla_put_failure:
2987         nlmsg_cancel(skb, nlh);
2988         return -EMSGSIZE;
2989 }
2990
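/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the requested route
 * via the input path (when RTA_IIF is supplied) or the output path, and
 * unicast the resulting route message back to the sender.
 */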
2991 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2992 {
2993         struct net *net = sock_net(in_skb->sk);
2994         struct rtmsg *rtm;
2995         struct nlattr *tb[RTA_MAX+1];
2996         struct rtable *rt = NULL;
2997         __be32 dst = 0;
2998         __be32 src = 0;
2999         u32 iif;
3000         int err;
3001         int mark;
3002         struct sk_buff *skb;
3003
3004         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3005         if (err < 0)
3006                 goto errout;
3007
3008         rtm = nlmsg_data(nlh);
3009
3010         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3011         if (skb == NULL) {
3012                 err = -ENOBUFS;
3013                 goto errout;
3014         }
3015
3016         /* Reserve room for dummy headers; this skb can pass
3017            through a good chunk of the routing engine.
3018          */
3019         skb_reset_mac_header(skb);
3020         skb_reset_network_header(skb);
3021
3022         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3023         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3024         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3025
3026         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3027         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3028         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3029         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3030
3031         if (iif) {
3032                 struct net_device *dev;
3033
3034                 dev = __dev_get_by_index(net, iif);
3035                 if (dev == NULL) {
3036                         err = -ENODEV;
3037                         goto errout_free;
3038                 }
3039
3040                 skb->protocol   = htons(ETH_P_IP);
3041                 skb->dev        = dev;
3042                 skb->mark       = mark;
3043                 local_bh_disable();
3044                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3045                 local_bh_enable();
3046
3047                 rt = skb_rtable(skb);
3048                 if (err == 0 && rt->dst.error)
3049                         err = -rt->dst.error;
3050         } else {
3051                 struct flowi fl = {
3052                         .fl4_dst = dst,
3053                         .fl4_src = src,
3054                         .fl4_tos = rtm->rtm_tos,
3055                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3056                         .mark = mark,
3057                 };
3058                 err = ip_route_output_key(net, &rt, &fl);
3059         }
3060
3061         if (err)
3062                 goto errout_free;
3063
3064         skb_dst_set(skb, &rt->dst);
3065         if (rtm->rtm_flags & RTM_F_NOTIFY)
3066                 rt->rt_flags |= RTCF_NOTIFY;
3067
3068         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3069                            RTM_NEWROUTE, 0, 0);
3070         if (err <= 0)
3071                 goto errout_free;
3072
3073         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3074 errout:
3075         return err;
3076
3077 errout_free:
3078         kfree_skb(skb);
3079         goto errout;
3080 }
3081
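/*
 * Dump the route cache for a netlink dump request, walking each hash
 * chain under rcu_read_lock_bh() and resuming from cb->args[].
 */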
3082 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3083 {
3084         struct rtable *rt;
3085         int h, s_h;
3086         int idx, s_idx;
3087         struct net *net;
3088
3089         net = sock_net(skb->sk);
3090
3091         s_h = cb->args[0];
3092         if (s_h < 0)
3093                 s_h = 0;
3094         s_idx = idx = cb->args[1];
3095         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3096                 if (!rt_hash_table[h].chain)
3097                         continue;
3098                 rcu_read_lock_bh();
3099                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3100                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3101                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3102                                 continue;
3103                         if (rt_is_expired(rt))
3104                                 continue;
3105                         skb_dst_set_noref(skb, &rt->dst);
3106                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3107                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3108                                          1, NLM_F_MULTI) <= 0) {
3109                                 skb_dst_drop(skb);
3110                                 rcu_read_unlock_bh();
3111                                 goto done;
3112                         }
3113                         skb_dst_drop(skb);
3114                 }
3115                 rcu_read_unlock_bh();
3116         }
3117
3118 done:
3119         cb->args[0] = h;
3120         cb->args[1] = idx;
3121         return skb->len;
3122 }
3123
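/* A multicast configuration change invalidates the whole route cache. */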
3124 void ip_rt_multicast_event(struct in_device *in_dev)
3125 {
3126         rt_cache_flush(dev_net(in_dev->dev), 0);
3127 }
3128
3129 #ifdef CONFIG_SYSCTL
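/*
 * Handler for the write-only "flush" sysctl: the written value is used
 * as the flush delay handed to rt_cache_flush(); reads return -EINVAL.
 */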
3130 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3131                                         void __user *buffer,
3132                                         size_t *lenp, loff_t *ppos)
3133 {
3134         if (write) {
3135                 int flush_delay;
3136                 ctl_table ctl;
3137                 struct net *net;
3138
3139                 memcpy(&ctl, __ctl, sizeof(ctl));
3140                 ctl.data = &flush_delay;
3141                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3142
3143                 net = (struct net *)__ctl->extra1;
3144                 rt_cache_flush(net, flush_delay);
3145                 return 0;
3146         }
3147
3148         return -EINVAL;
3149 }
3150
3151 static ctl_table ipv4_route_table[] = {
3152         {
3153                 .procname       = "gc_thresh",
3154                 .data           = &ipv4_dst_ops.gc_thresh,
3155                 .maxlen         = sizeof(int),
3156                 .mode           = 0644,
3157                 .proc_handler   = proc_dointvec,
3158         },
3159         {
3160                 .procname       = "max_size",
3161                 .data           = &ip_rt_max_size,
3162                 .maxlen         = sizeof(int),
3163                 .mode           = 0644,
3164                 .proc_handler   = proc_dointvec,
3165         },
3166         {
3167                 /* Deprecated. Use gc_min_interval_ms */
3169                 .procname       = "gc_min_interval",
3170                 .data           = &ip_rt_gc_min_interval,
3171                 .maxlen         = sizeof(int),
3172                 .mode           = 0644,
3173                 .proc_handler   = proc_dointvec_jiffies,
3174         },
3175         {
3176                 .procname       = "gc_min_interval_ms",
3177                 .data           = &ip_rt_gc_min_interval,
3178                 .maxlen         = sizeof(int),
3179                 .mode           = 0644,
3180                 .proc_handler   = proc_dointvec_ms_jiffies,
3181         },
3182         {
3183                 .procname       = "gc_timeout",
3184                 .data           = &ip_rt_gc_timeout,
3185                 .maxlen         = sizeof(int),
3186                 .mode           = 0644,
3187                 .proc_handler   = proc_dointvec_jiffies,
3188         },
3189         {
3190                 .procname       = "gc_interval",
3191                 .data           = &ip_rt_gc_interval,
3192                 .maxlen         = sizeof(int),
3193                 .mode           = 0644,
3194                 .proc_handler   = proc_dointvec_jiffies,
3195         },
3196         {
3197                 .procname       = "redirect_load",
3198                 .data           = &ip_rt_redirect_load,
3199                 .maxlen         = sizeof(int),
3200                 .mode           = 0644,
3201                 .proc_handler   = proc_dointvec,
3202         },
3203         {
3204                 .procname       = "redirect_number",
3205                 .data           = &ip_rt_redirect_number,
3206                 .maxlen         = sizeof(int),
3207                 .mode           = 0644,
3208                 .proc_handler   = proc_dointvec,
3209         },
3210         {
3211                 .procname       = "redirect_silence",
3212                 .data           = &ip_rt_redirect_silence,
3213                 .maxlen         = sizeof(int),
3214                 .mode           = 0644,
3215                 .proc_handler   = proc_dointvec,
3216         },
3217         {
3218                 .procname       = "error_cost",
3219                 .data           = &ip_rt_error_cost,
3220                 .maxlen         = sizeof(int),
3221                 .mode           = 0644,
3222                 .proc_handler   = proc_dointvec,
3223         },
3224         {
3225                 .procname       = "error_burst",
3226                 .data           = &ip_rt_error_burst,
3227                 .maxlen         = sizeof(int),
3228                 .mode           = 0644,
3229                 .proc_handler   = proc_dointvec,
3230         },
3231         {
3232                 .procname       = "gc_elasticity",
3233                 .data           = &ip_rt_gc_elasticity,
3234                 .maxlen         = sizeof(int),
3235                 .mode           = 0644,
3236                 .proc_handler   = proc_dointvec,
3237         },
3238         {
3239                 .procname       = "mtu_expires",
3240                 .data           = &ip_rt_mtu_expires,
3241                 .maxlen         = sizeof(int),
3242                 .mode           = 0644,
3243                 .proc_handler   = proc_dointvec_jiffies,
3244         },
3245         {
3246                 .procname       = "min_pmtu",
3247                 .data           = &ip_rt_min_pmtu,
3248                 .maxlen         = sizeof(int),
3249                 .mode           = 0644,
3250                 .proc_handler   = proc_dointvec,
3251         },
3252         {
3253                 .procname       = "min_adv_mss",
3254                 .data           = &ip_rt_min_advmss,
3255                 .maxlen         = sizeof(int),
3256                 .mode           = 0644,
3257                 .proc_handler   = proc_dointvec,
3258         },
3259         { }
3260 };
3261
3262 static struct ctl_table empty[1];
3263
3264 static struct ctl_table ipv4_skeleton[] =
3265 {
3266         { .procname = "route", 
3267           .mode = 0555, .child = ipv4_route_table},
3268         { .procname = "neigh", 
3269           .mode = 0555, .child = empty},
3270         { }
3271 };
3272
3273 static __net_initdata struct ctl_path ipv4_path[] = {
3274         { .procname = "net", },
3275         { .procname = "ipv4", },
3276         { },
3277 };
3278
3279 static struct ctl_table ipv4_route_flush_table[] = {
3280         {
3281                 .procname       = "flush",
3282                 .maxlen         = sizeof(int),
3283                 .mode           = 0200,
3284                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3285         },
3286         { },
3287 };
3288
3289 static __net_initdata struct ctl_path ipv4_route_path[] = {
3290         { .procname = "net", },
3291         { .procname = "ipv4", },
3292         { .procname = "route", },
3293         { },
3294 };
3295
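/*
 * Register the per-namespace "flush" sysctl.  Namespaces other than
 * init_net get their own copy of the table so that extra1 can carry the
 * right struct net pointer.
 */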
3296 static __net_init int sysctl_route_net_init(struct net *net)
3297 {
3298         struct ctl_table *tbl;
3299
3300         tbl = ipv4_route_flush_table;
3301         if (!net_eq(net, &init_net)) {
3302                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3303                 if (tbl == NULL)
3304                         goto err_dup;
3305         }
3306         tbl[0].extra1 = net;
3307
3308         net->ipv4.route_hdr =
3309                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3310         if (net->ipv4.route_hdr == NULL)
3311                 goto err_reg;
3312         return 0;
3313
3314 err_reg:
3315         if (tbl != ipv4_route_flush_table)
3316                 kfree(tbl);
3317 err_dup:
3318         return -ENOMEM;
3319 }
3320
3321 static __net_exit void sysctl_route_net_exit(struct net *net)
3322 {
3323         struct ctl_table *tbl;
3324
3325         tbl = net->ipv4.route_hdr->ctl_table_arg;
3326         unregister_net_sysctl_table(net->ipv4.route_hdr);
3327         BUG_ON(tbl == ipv4_route_flush_table);
3328         kfree(tbl);
3329 }
3330
3331 static __net_initdata struct pernet_operations sysctl_route_ops = {
3332         .init = sysctl_route_net_init,
3333         .exit = sysctl_route_net_exit,
3334 };
3335 #endif
3336
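/* Give each new namespace a random route cache generation id. */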
3337 static __net_init int rt_genid_init(struct net *net)
3338 {
3339         get_random_bytes(&net->ipv4.rt_genid,
3340                          sizeof(net->ipv4.rt_genid));
3341         return 0;
3342 }
3343
3344 static __net_initdata struct pernet_operations rt_genid_ops = {
3345         .init = rt_genid_init,
3346 };
3347
3348
3349 #ifdef CONFIG_IP_ROUTE_CLASSID
3350 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3351 #endif /* CONFIG_IP_ROUTE_CLASSID */
3352
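/* "rhash_entries=" boot parameter: force the size of the route cache hash. */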
3353 static __initdata unsigned long rhash_entries;
3354 static int __init set_rhash_entries(char *str)
3355 {
3356         if (!str)
3357                 return 0;
3358         rhash_entries = simple_strtoul(str, &str, 0);
3359         return 1;
3360 }
3361 __setup("rhash_entries=", set_rhash_entries);
3362
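/*
 * Boot-time initialisation: create the dst slab and the route cache
 * hash table, kick off the periodic garbage collection work and
 * register the proc, sysctl and rtnetlink hooks.
 */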
3363 int __init ip_rt_init(void)
3364 {
3365         int rc = 0;
3366
3367 #ifdef CONFIG_IP_ROUTE_CLASSID
3368         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3369         if (!ip_rt_acct)
3370                 panic("IP: failed to allocate ip_rt_acct\n");
3371 #endif
3372
3373         ipv4_dst_ops.kmem_cachep =
3374                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3375                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3376
3377         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3378
3379         if (dst_entries_init(&ipv4_dst_ops) < 0)
3380                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3381
3382         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3383                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3384
3385         rt_hash_table = (struct rt_hash_bucket *)
3386                 alloc_large_system_hash("IP route cache",
3387                                         sizeof(struct rt_hash_bucket),
3388                                         rhash_entries,
3389                                         (totalram_pages >= 128 * 1024) ?
3390                                         15 : 17,
3391                                         0,
3392                                         &rt_hash_log,
3393                                         &rt_hash_mask,
3394                                         rhash_entries ? 0 : 512 * 1024);
3395         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3396         rt_hash_lock_init();
3397
3398         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3399         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3400
3401         devinet_init();
3402         ip_fib_init();
3403
3404         /* All the timers started at system boot tend
3405            to synchronize. Perturb them a bit.
3406          */
3407         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3408         expires_ljiffies = jiffies;
3409         schedule_delayed_work(&expires_work,
3410                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3411
3412         if (ip_rt_proc_init())
3413                 printk(KERN_ERR "Unable to create route proc files\n");
3414 #ifdef CONFIG_XFRM
3415         xfrm_init();
3416         xfrm4_init(ip_rt_max_size);
3417 #endif
3418         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3419
3420 #ifdef CONFIG_SYSCTL
3421         register_pernet_subsys(&sysctl_route_ops);
3422 #endif
3423         register_pernet_subsys(&rt_genid_ops);
3424         return rc;
3425 }
3426
3427 #ifdef CONFIG_SYSCTL
3428 /*
3429  * We really need to sanitize the damn ipv4 init order, then all
3430  * this nonsense will go away.
3431  */
3432 void __init ip_static_sysctl_init(void)
3433 {
3434         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3435 }
3436 #endif